diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt index 5a682535e046c269ca7ffd5cc87c05fef850e211..4f482cb6603a17b8aa60b8397ddfdeafda0d3735 100644 --- a/cmake_targets/CMakeLists.txt +++ b/cmake_targets/CMakeLists.txt @@ -138,8 +138,8 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") if(EXISTS "/proc/cpuinfo") file(STRINGS "/proc/cpuinfo" CPUINFO REGEX flags LIMIT_COUNT 1) - if (CPUINFO MATCHES "avx512f") - set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -march=skylake-avx512 -mtune=skylake-avx512") + if (CPUINFO MATCHES "avx512bw") + set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx512bw") set(COMPILATION_AVX2 "True") else() if (CPUINFO MATCHES "avx2") diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c index 13223a1e4068749d24ab247e7f2aa6c3c99afde4..893eee5574b06f0d03d8240c35d346c74aab068c 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c @@ -333,8 +333,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP { case 384: { - // nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); - nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //we test here + //nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); + nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //we test here break; } case 352: @@ -1475,7 +1475,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP case 384: { nrLDPC_cnProc_BG2_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); - // nrLDPC_cnProc_BG2_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); + //nrLDPC_cnProc_BG2_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); break; } case 352: @@ -1922,7 +1922,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP { case 384: { - // nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); + //nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //we test here break; } @@ -2273,7 +2273,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP case 384: { nrLDPC_cnProc_BG2_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); - // nrLDPC_cnProc_BG2_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); + //nrLDPC_cnProc_BG2_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); break; } case 352: diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_init_mem.h b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_init_mem.h index 3292c0debefc82ffb5ef9dbc71d6c2d39791747d..acd40d5d2d305e3fdaa53705c7ec4986b5872089 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_init_mem.h +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_init_mem.h @@ -42,7 +42,7 @@ */ static inline void* malloc32_clear(size_t size) { - void* ptr = (void*) memalign(32, size+32); + void* ptr = (void*) memalign(64, size+64); memset(ptr, 0, size); return ptr; } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/Makefile b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/Makefile index 5f58a090ecd5f9e4012c7fc25d3f892b750427a8..a3bee96b470aafe00211f3d4376d03c8d2582f80 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/Makefile +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/Makefile @@ -8,12 +8,12 @@ OBJ= $(SRC:.c=.o) all: $(EXEC) cnProc_gen_avx2: $(OBJ) - @$(CC) -o $@ $^ $(LDFLAGS) -O2 + @$(CC) -o $@ $^ $(LDFLAGS) -O2 #main.o: cnProc_gen_avx2.h %.o: %.c - @$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g + @$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -std=c99 .PHONY: clean mrproper diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_BG1_avx2.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_BG1_avx2.c index 562caa6f262e696bf37b74376aece19376077f83..c9b371038f1c67e6c92afad7112e10aaeb53a154 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_BG1_avx2.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_BG1_avx2.c @@ -21,7 +21,6 @@ void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t Z,int R) fprintf(fd,"#include <stdint.h>\n"); fprintf(fd,"#include <immintrin.h>\n"); - fprintf(fd,"#include "\"../include/avx512fintrin.h>\"\n"); fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s_AVX2(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]); @@ -411,10 +410,10 @@ void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t Z,int R) // Process group with 8 BNs fprintf(fd,"//Process group with 8 BNs\n"); // Offset is 2*384/32 = 24 - const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168}, - {0,24,72,96,120,144,168}, {0,24,48,96,120,144,168}, - {0,24,48,72,120,144,168}, {0,24,48,72,96,144,168}, - {0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}}; + const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168}, + {0,24,72,96,112,144,168}, {0,24,48,96,112,144,168}, + {0,24,48,72,112,144,168}, {0,24,48,72,96,144,168}, + {0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}}; diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_avx2 b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_avx2 old mode 100644 new mode 100755 index 1d463e5c746fd4c49d0a36ca10bc7646c83de306..0f56660ba8eefa85a426103f64a464495a7c0566 Binary files a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_avx2 and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx2/cnProc_gen_avx2 differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/Makefile b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/Makefile index 5d80ba79e83b9a998ab1559bd4ea44710e9f059f..b17a7fe62a35d17335979a812d7e1ee1a7312ecb 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/Makefile +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/Makefile @@ -13,7 +13,7 @@ cnProc_gen_avx512: $(OBJ) #main.o: cnProc_gen_avx512.h %.o: %.c - @$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g + @$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -std=c99 .PHONY: clean mrproper diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG1_avx512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG1_avx512.c index a6ffe84223a6251ba5cd3eb709cd242a3c80d4d4..b4ccb0a90614c248d9bb3bc6f2961fb25f218a12 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG1_avx512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG1_avx512.c @@ -1,8 +1,6 @@ +#include <stdio.h> #include <stdint.h> -#include <immintrin.h> #include "../../nrLDPCdecoder_defs.h" -#include "../../nrLDPC_types.h" -#include "../../nrLDPC_bnProc.h" void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) { @@ -22,12 +20,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd,"#include <immintrin.h>\n"); - fprintf(fd, "__m512i _mm512_sign_epi16(__m512i a, __m512i b){ \n"); /* Emulate _mm512_sign_epi16() with instructions that exist in the AVX-512 instruction set */ - fprintf(fd, "b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); \n" ); - fprintf(fd, "b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); \n" ); - fprintf(fd, " a = _mm512_mullo_epi16(a, b);\n"); - fprintf(fd, "return a;\n"); - fprintf(fd, "}\n" ); + fprintf(fd, "#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)\n"); fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]); @@ -69,21 +62,21 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Offsets are in units of bitOffsetInGroup (1*384/64)=6 // Offsets are in units of bitOffsetInGroup (1*384/64)=6 - const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}}; + const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}}; - fprintf(fd," __m512i zmm0, min, sgn,ones,maxLLR;\n"); - fprintf(fd," ones = _mm512_set1_epi8((char)1);\n"); + fprintf(fd," __m512i zmm0, min, sgn,zeros,maxLLR;\n"); + fprintf(fd," zeros = _mm512_setzero_si512();\n"); fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n"); if (lut_numCnInCnGroups[0] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[0]*Z + 63)>>5; + M = (lut_numCnInCnGroups[0]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 3 @@ -104,53 +97,51 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // 32 CNs of second BN // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][1]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[0]>>6)+(j*bitOffsetInGroup)); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+(lut_idxCnProcG3[j][0]/2)+1); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // 32 CNs of second BN // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+(lut_idxCnProcG3[j][1]/2)+1); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[0]>>6)+(j*bitOffsetInGroup)+1); fprintf(fd," }\n"); } @@ -160,16 +151,16 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 4 BNs fprintf(fd,"//Process group with 4 BNs\n"); // Offset is 5*384/64 = 30 - const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}}; + const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}}; if (lut_numCnInCnGroups[1] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[1]*Z + 63)>>5; + M = (lut_numCnInCnGroups[1]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 4 @@ -188,31 +179,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>6)+lut_idxCnProcG4[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<3; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>6)+lut_idxCnProcG4[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[1]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -222,18 +212,18 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 5 BNs fprintf(fd,"//Process group with 5 BNs\n"); // Offset is 18*384/64 = 216 - const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432}, - {0,108,324,432}, {0,108,216,432}, {0,108,216,324}}; + const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864}, + {0,216,648,864}, {0,216,432,864}, {0,216,432,648}}; if (lut_numCnInCnGroups[2] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[2]*Z + 63)>>5; + M = (lut_numCnInCnGroups[2]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 4 @@ -253,31 +243,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>6)+lut_idxCnProcG5[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<4; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>6)+lut_idxCnProcG5[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[2]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -286,19 +275,19 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 6 BNs fprintf(fd,"//Process group with 6 BNs\n"); // Offset is 8*384/64 = 48 - const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240}, - {0,48,144,192,240}, {0,48,96,192,240}, - {0,48,96,144,240}, {0,48,96,144,192}}; + const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480}, + {0,96,288,384,480}, {0,96,192,384,480}, + {0,96,192,288,480}, {0,96,192,288,384}}; if (lut_numCnInCnGroups[3] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[3]*Z + 63)>>5; + M = (lut_numCnInCnGroups[3]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 4 @@ -318,31 +307,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>6)+lut_idxCnProcG6[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<5; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>6)+lut_idxCnProcG6[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[3]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -352,20 +340,20 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 7 BNs fprintf(fd,"//Process group with 7 BNs\n"); // Offset is 5*384/64 = 30 - const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180}, - {0,30,90,120,150,180}, {0,30,60,120,150,180}, - {0,30,60,90,150,180}, {0,30,60,90,120,180}, - {0,30,60,90,120,150}}; + const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360}, + {0,60,180,240,300,360}, {0,60,120,240,300,360}, + {0,60,120,180,300,360}, {0,60,120,180,240,360}, + {0,60,120,180,240,300}}; if (lut_numCnInCnGroups[4] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[4]*Z + 63)>>5; + M = (lut_numCnInCnGroups[4]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 4 @@ -385,31 +373,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn= ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>6)+lut_idxCnProcG7[j][0]/2); + // sgn = _mm512_sign_epi816(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<6; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>6)+lut_idxCnProcG7[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[4]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -419,10 +406,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 8 BNs fprintf(fd,"//Process group with 8 BNs\n"); // Offset is 2*384/64 = 12 - const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84}, - {0,12,36,48,56,72,84}, {0,12,24,48,56,72,84}, - {0,12,24,36,56,72,84}, {0,12,24,36,48,72,84}, - {0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}}; + const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168}, + {0,24,72,96,112,144,168}, {0,24,48,96,112,144,168}, + {0,24,48,72,112,144,168}, {0,24,48,72,96,144,168}, + {0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}}; @@ -430,10 +417,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[5]*Z + 63)>>5; + M = (lut_numCnInCnGroups[5]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 4 @@ -453,31 +440,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>6)+lut_idxCnProcG8[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<7; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>6)+lut_idxCnProcG8[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[5]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -486,11 +472,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 9 BNs fprintf(fd,"//Process group with 9 BNs\n"); // Offset is 2*384/64 = 12 - const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96}, - {0,12,36,48,60,72,84,96}, {0,12,24,48,60,72,84,96}, - {0,12,24,36,60,72,84,96}, {0,12,24,36,48,72,84,96}, - {0,12,24,36,48,60,84,96}, {0,12,24,36,48,60,72,96}, - {0,12,24,36,48,60,72,84}}; + const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192}, + {0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192}, + {0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192}, + {0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192}, + {0,24,48,72,96,120,144,168}}; @@ -499,10 +485,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[6]*Z + 63)>>5; + M = (lut_numCnInCnGroups[6]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 9 @@ -522,31 +508,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>6)+lut_idxCnProcG9[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<8; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>6)+lut_idxCnProcG9[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn, zeros);\n",(lut_startAddrCnGroups[6]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -555,11 +540,12 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 10 BNs fprintf(fd,"//Process group with 10 BNs\n"); // Offset is 1*384/64 = 6 - const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54}, - {0,6,18,24,30,36,42,48,54}, {0,6,12,24,30,36,42,48,54}, - {0,6,12,18,30,36,42,48,54}, {0,6,12,18,24,36,42,48,54}, - {0,6,12,18,24,30,42,48,54}, {0,6,12,18,24,30,36,48,54}, - {0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}}; + const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108}, + {0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108}, + {0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108}, + {0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108}, + {0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}}; + @@ -569,10 +555,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[7]*Z + 63)>>5; + M = (lut_numCnInCnGroups[7]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 10 @@ -592,31 +578,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>6)+lut_idxCnProcG10[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<9; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>6)+lut_idxCnProcG10[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min,sgn,zeros);\n",(lut_startAddrCnGroups[7]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } @@ -626,26 +611,26 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) // Process group with 19 BNs fprintf(fd,"//Process group with 19 BNs\n"); // Offset is 4*384/64 = 24 - const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, - {0,24,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, - {0,24,48,72,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,144,168,192,216,240,264,288,312,336,360,384,408,432}, - {0,24,48,72,96,120,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,192,216,240,264,288,312,336,360,384,408,432}, - {0,24,48,72,96,120,144,168,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,240,264,288,312,336,360,384,408,432}, - {0,24,48,72,96,120,144,168,192,216,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,288,312,336,360,384,408,432}, - {0,24,48,72,96,120,144,168,192,216,240,264,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,336,360,384,408,432}, - {0,24,48,72,96,120,144,168,192,216,240,264,288,312,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,384,408,432}, - {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,432}, - {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}}; + const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, + {0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, + {0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864}, + {0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864}, + {0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864}, + {0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864}, + {0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864}, + {0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864}, + {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864}, + {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}}; if (lut_numCnInCnGroups[8] > 0) { // Number of groups of 64 CNs for parallel processing // Ceil for values not divisible by 64 - M = (lut_numCnInCnGroups[8]*Z + 63)>>5; + M = (lut_numCnInCnGroups[8]*Z + 63)>>6; // Set the offset to each bit within a group in terms of 64 Byte - bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5; + bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>6; // Set pointers to start of group 19 @@ -665,31 +650,30 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) fprintf(fd," for (int i=0;i<%d;i++) {\n",M); // Abs and sign of 64 CNs (first BN) // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]); - // sgn = _mm512_sign_epi16(ones, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); + fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>6)+lut_idxCnProcG19[j][0]/2); + // sgn = _mm512_sign_epi8(ones, zmm0); // min = _mm512_abs_epi8(zmm0); - fprintf(fd," min = _mm512_abs_epi8(zmm0);\n"); + fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); // Loop over BNs for (k=1; k<18; k++) { - fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]); + fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>6)+lut_idxCnProcG19[j][k]/2); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); - // sgn = _mm512_sign_epi16(sgn, zmm0); - fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n"); + // sgn = _mm512_sign_epi8(sgn, zmm0); + fprintf(fd," sgn = _mm512_xor_si512(sgn, zmm0);\n"); } // Store result // min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127 fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); - // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); + // *p_cnProcBufResBit = _mm512_sign_epi8(min, sgn); // p_cnProcBufResBit++; - fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup)); + fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = conditional_negate(min, sgn,zeros);\n",(lut_startAddrCnGroups[8]>>6)+(j*bitOffsetInGroup)); fprintf(fd," }\n"); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG2_avx512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG2_avx512.c index 282cede4d6bbdcc2739ab092d6384bdd1fea5a23..405df50beb50c3bfd6b56011ad78658179549584 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG2_avx512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_BG2_avx512.c @@ -1,8 +1,7 @@ +#include <stdio.h> #include <stdint.h> -#include <immintrin.h> #include "../../nrLDPCdecoder_defs.h" -#include "../../nrLDPC_types.h" -#include "../../nrLDPC_bnProc.h" + void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) { diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_avx512 b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_avx512 index 776516a4d72e8d73d51b9ed5a957c8915d912bea..c7dfcb1bc572432a8274f68b1ae7b4a54915d1f4 100755 Binary files a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_avx512 and b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/cnProc_gen_avx512 differ diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/main.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/main.c index be52b9698c5b53896cf19bc9d641d45fd9f41b5a..d9be56dde1b1d847d0fdb6a6fbeb35a193f12848 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/main.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_avx512/main.c @@ -1,9 +1,6 @@ #include <stdio.h> -#include <immintrin.h> -#include "../../nrLDPC_types.h" -#include "../../nrLDPC_init.h" -#include "../../nrLDPC_bnProc.h" +#include <stdint.h> #define NB_Z 51 void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t,int); void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t,int); diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z104_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z104_13_AVX512.c index a80f3fb2808070378f81300654891ec600d2399f..7090e8865959303f250c91c6c789f076fca65675 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z104_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z104_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z104_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<60;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<60;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<60;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<60;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<60;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z10_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z10_13_AVX512.c index b46f3f2e20e8b9bda983c8a345eb7a79aa3e6a25..a22fe64e88641f1804cbf206a2bd557b59a25374 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z10_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z10_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z10_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z112_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z112_13_AVX512.c index 14de356cc0092b1a2b324964bf6a3f324a3ff004..46af61917a6bbcfdfafbc662653d8e02bd1e6303 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z112_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z112_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z112_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<64;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<64;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<64;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<64;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<64;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<9;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z11_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z11_13_AVX512.c index aeba3ddd1f21f27d040f8b0f93f1319e4e1afbc3..6db79c76dbc0eb84a483b5c3f51f770d61642b03 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z11_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z11_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z11_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z120_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z120_13_AVX512.c index 71bc1db2bdedb579aad7664eb957e49f55c6aa34..0422bb74153cc3c21ead6a084b5dce1cde97e5c8 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z120_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z120_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z120_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<69;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<34;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<69;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<34;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<69;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<34;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<69;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<34;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<69;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<34;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<20;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z128_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z128_13_AVX512.c index 15edfb764789aa696d0535c526a5dbc568251ad3..2bb9a39a86490098ee0aca87c4b27073658fd5f0 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z128_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z128_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z128_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<10;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z12_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z12_13_AVX512.c index 09610a25e75e20d5fab0ef846dd75591a03d1eca..2b2a2e80eb720b824c58407bf9b61a181c12d44b 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z12_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z12_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z12_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z13_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z13_13_AVX512.c index 96e108ac27cfbcf4f5a288be51e733747118c070..af39ffbbe06b8dc902c5d6d73b0a2c9cfa1b0dee 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z13_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z13_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z13_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z144_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z144_13_AVX512.c index 449becf031c287beccd558527daa9bbda965d95c..b0caad9435ec89853a571b285eb7155f3068d992 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z144_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z144_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z144_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<82;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<41;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<82;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<41;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<82;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<41;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<82;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<41;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<82;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<41;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<12;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z14_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z14_13_AVX512.c index 0145645d66f79b405da0e72873614b71ee8fe07d..587dfe323ad50262ceff0f013d122a517aba69d2 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z14_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z14_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z14_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z15_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z15_13_AVX512.c index c301777ab6f0b46bb1d89f48d94b152b6c59591b..9fc63c424fc4844d840c0babdd9f79b5ff674e3f 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z15_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z15_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z15_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z160_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z160_13_AVX512.c index bd90abe1edea96bb61089f64c94423694031c66f..51a4cc7e1637552d72b6593bcc65e7a8ed79e6bc 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z160_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z160_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z160_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<91;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<45;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<91;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<45;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<91;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<45;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<91;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<45;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<91;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<45;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<13;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z16_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z16_13_AVX512.c index 4108cc350d686bf690e59972107c702a1a46dc33..eb0477a6f8609c277453aa0945e960948157f20c 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z16_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z16_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z16_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z176_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z176_13_AVX512.c index 2b8c34cfcd7a6c0dd180b3e5091527bd97efb619..50d4ea6cf9e418f23703b2375ad38922f1b707c0 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z176_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z176_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z176_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<100;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<50;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<100;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<50;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<100;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<50;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<100;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<50;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<100;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<50;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<14;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z18_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z18_13_AVX512.c index 79fd7bb3fcf1984662cae1fb0b553c86a4a04f14..23af4309f989769238e709824b42f957baa3dc89 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z18_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z18_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z18_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z192_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z192_13_AVX512.c index fa57e497b25c69eb8e1c0e51797880b9c449badf..f4d9331801d5952efe59c6b59d5348ffa743288b 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z192_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z192_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z192_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<3;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<109;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<54;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<109;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<54;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<109;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<54;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<109;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<54;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<109;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<54;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<15;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z208_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z208_13_AVX512.c index 6533824d0474ed38d50fea2ec670e335f9c5e497..fc2d4598d9b3b53469b15f91edbb8459161498a2 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z208_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z208_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z208_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<118;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<59;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<118;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<59;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<118;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<59;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<118;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<59;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<118;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<59;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<53;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<26;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<34;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<17;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<27;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z20_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z20_13_AVX512.c index b64ec809d7d1b4533e8bb2eaef1b98827293f282..c1812d6cdf201312898d2a425d48af27b22dbd06 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z20_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z20_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z20_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z224_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z224_13_AVX512.c index 96bd8f328b0e3b5b670fe2cb3467694993f78981..a3712dd09481377f071c4d9bf8902bec61310e7d 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z224_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z224_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z224_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<127;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<63;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<127;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<63;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<127;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<63;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<127;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<63;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<127;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<63;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<57;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<36;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<18;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<29;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z22_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z22_13_AVX512.c index 071aaa7acc3fd1e72dff480ad5bdd42cb5e0f0c3..e57a6940bbb7478a18c7da4680e1818b2048ea0a 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z22_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z22_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z22_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z240_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z240_13_AVX512.c index 53a9cdbdf42d1994cb28beed766ad85f3cc13b59..547fe814214c041b7c4c1bd9e16b4769d6201c51 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z240_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z240_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z240_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<19;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<19;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<19;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<19;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<136;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<68;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<136;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<68;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<136;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<68;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<136;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<68;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<136;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<68;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<39;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<19;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z24_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z24_13_AVX512.c index a34658b5ec14c30d84928a229506351216e9aa91..a82c63869b6ec0e0f8afc4ec7672b73e434d4cd8 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z24_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z24_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z24_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<2;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z256_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z256_13_AVX512.c index c7a646abaeef0c31a778b7cf7fbcb4920b059242..b52b8b6d808d4d33f156c7430476e96f0b7503be 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z256_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z256_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z256_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<4;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<145;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<72;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<145;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<72;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<145;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<72;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<145;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<72;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<145;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<72;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<65;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<32;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<20;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z26_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z26_13_AVX512.c index bd776e8af1b51b822999b20d2846cefff94f14ee..3a3855364931dfe79f879b77f7ca09f6eda1dd55 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z26_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z26_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z26_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z288_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z288_13_AVX512.c index 90553112b04a0bface12c65eebf69db5ec466b23..021d7ebc0678cf6503273892ce4e873b50445fbe 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z288_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z288_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z288_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<10;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<163;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<81;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<163;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<81;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<163;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<81;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<163;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<81;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<163;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<81;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<73;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<36;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<23;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z28_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z28_13_AVX512.c index aca2ba32ebe6d035fb75d33703b16e84ea9da105..f9a3dc4d788200a3e8b44ce390ad6e428de11784 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z28_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z28_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z28_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z2_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z2_13_AVX512.c index 92237b43f2bee7b0e10df696d7e31cfd24043a13..28cd588dd5d0d256d77593b34326ad40d56edf63 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z2_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z2_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z2_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z30_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z30_13_AVX512.c index b08797b2c323c261e73c1b0bd456bd611c731c50..3577d0408c06e82100a17d660f50cd793e75cd5a 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z30_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z30_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z30_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<18;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z320_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z320_13_AVX512.c index 09f4b52d2fde48a1c5e5f1927658544645a2fbe9..481d3cfb57aa44bf840c56df5bf872fd5c5178ec 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z320_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z320_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z320_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<11;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<5;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<25;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<25;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<25;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<25;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<181;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<90;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<181;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<90;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<181;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<90;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<181;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<90;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<181;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<90;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<81;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<40;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<51;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<25;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<41;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<20;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z32_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z32_13_AVX512.c index a09cfac995df2eb77026446dd4b6a918bc6c7238..948ef9011e64c776bcf4e1233f8f62e3a64b47b3 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z32_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z32_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z32_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z352_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z352_13_AVX512.c index d9e160a715a65ec499f51334281158cc6b4b7af2..0e1403eab1f071e305b5eea974eab10d0cfb8b88 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z352_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z352_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z352_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<12;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<28;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<199;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<99;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<199;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<99;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<199;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<99;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<199;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<99;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<199;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<99;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<89;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<44;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<56;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<28;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<23;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<45;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<22;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z36_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z36_13_AVX512.c index 25b58a5900c44a54acdd62cf7cb846a4f0025475..47a3ac7eb51e5fc726fb6602e2494e4b877d0132 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z36_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z36_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z36_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<22;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<22;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<22;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<22;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<22;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<11;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<3;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c index 90743f4d94962ade10687cd74a1899ad85a14aa4..e28d5adbb49f9f86a9b33e3ad522d1255c6aa497 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z384_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<13;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<6;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<30;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<217;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<108;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<217;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<108;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<217;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<108;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<217;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<108;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<217;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<108;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<97;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<48;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<61;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<30;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<25;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<49;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<24;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z3_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z3_13_AVX512.c index 68d316935cbf96cde5c5a3c912c56f4b625ce82d..0d61a00fd6415d85291f04936bc4603131444899 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z3_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z3_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z3_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z40_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z40_13_AVX512.c index 39bd657832e1d27f5f91c0e150177b21081314bb..8d64735af73bcbe6d09aa31dc8863d2fb281b424 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z40_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z40_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z40_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<24;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<12;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z44_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z44_13_AVX512.c index aa1e6434977ea2527ff787b2b377fe1d66ed990f..9b7179e26c19a1bcc48fc561507285da8a550849 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z44_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z44_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z44_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<26;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<13;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<12;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z48_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z48_13_AVX512.c index 5638fe34d42d7961b2efac779adf34cf2a7d7b63..d3483cc5db7da7508787578fcae8b8d855e5b7ff 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z48_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z48_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z48_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<28;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<28;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<28;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<28;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<28;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<14;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<4;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<7;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z4_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z4_13_AVX512.c index 268c1feda5b82e685d26f6a97452e6edc299efa8..e3a32db4954982dac5f2621c7dd6e15ea23a1ca3 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z4_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z4_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z4_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z52_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z52_13_AVX512.c index bbb23c68e1574537be9e017a66c36813278858bd..e23a9003fcdd2516aa60e9ad69cb6aad0ea2a295 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z52_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z52_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z52_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<31;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<15;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z56_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z56_13_AVX512.c index 697578fb914f5d509112d9a3aa6a2cf78b0d472e..72d3b1577128e3ab81f3982f443ceab3d5880c4e 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z56_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z56_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z56_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<33;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<16;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<15;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<8;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z5_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z5_13_AVX512.c index 9ecfe8253edd6999509477b4fdf85874538d706b..2e5adf5f3458084cb1d244a84aa85cbcbc162c7e 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z5_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z5_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z5_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z60_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z60_13_AVX512.c index 7388cfc93c245e97b8c0d30d0ffcfce2e3dd2b54..df5670b10ffd30fcb737662c25cdd58b67fbcc05 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z60_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z60_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z60_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<35;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<35;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<35;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<35;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<35;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<17;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<16;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z64_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z64_13_AVX512.c index 344c581d07c1885d16c482e3426adf83322bc292..c3addcbd4893c737f4bfc14c2e891dcc9e64e4c7 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z64_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z64_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z64_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<37;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<18;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<17;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<8;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<11;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<5;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<9;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<4;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z6_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z6_13_AVX512.c index f36090f3c5d690361629378de638c77d2381626c..1fb8a310abb8994ef052def1057100ef40379fb0 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z6_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z6_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z6_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z72_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z72_13_AVX512.c index 7cd843079af45be67ff9b34daf9f3414872b45ae..5f6e1e3946497a635336f58194bcbd6b4781fd38 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z72_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z72_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z72_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<6;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<42;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<21;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<42;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<21;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<42;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<21;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<42;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<21;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<42;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<21;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<19;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<9;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<13;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<6;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<4;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<10;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<5;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z7_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z7_13_AVX512.c index b6b2217da5eca3275ad38ad8afd89ce417695eec..21f46236470242327e108962a76a45ab3d47e575 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z7_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z7_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z7_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<1;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<5;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<2;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<3;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<1;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2592+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1296+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2616+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1308+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2640+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1320+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2664+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1332+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2688+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1344+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2712+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1356+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1284+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2736+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1368+i] = conditional_negate(min, sgn, zeros); } //Process group with 10 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1386+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2760+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1380+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2772+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1386+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2784+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1392+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2796+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1398+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2808+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1404+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2820+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1410+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2832+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1416+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2844+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1422+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2778+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2814+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1434+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2856+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1428+i] = conditional_negate(min,sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2760+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2766+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1380+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1386+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2772+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1392+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1398+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2784+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1404+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2790+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1410+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2796+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1416+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2802+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1422+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2808+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1428+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2868+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1434+i] = conditional_negate(min,sgn,zeros); } //Process group with 19 BNs - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1464+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2880+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1440+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2928+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1464+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2976+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1488+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3024+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1512+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3072+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1536+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3120+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1560+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3168+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1584+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1608+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3264+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1632+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3312+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1656+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3360+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1680+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3408+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1704+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3456+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1728+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3504+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1752+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3552+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1776+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3600+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1800+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3648+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1824+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3312+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1872+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3696+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1848+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<2;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2880+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2904+i]; + for (int i=0;i<1;i++) { + sgn = ((__m512i*)cnProcBuf)[1440+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1464+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2928+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1488+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2952+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1512+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2976+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1536+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3000+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1560+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3024+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1584+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3048+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1608+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3072+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1632+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3096+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1656+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3120+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1680+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3144+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1704+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3168+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1728+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3192+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1752+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3216+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1776+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3240+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1800+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3264+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1824+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[3288+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1848+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[3744+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1872+i] = conditional_negate(min, sgn,zeros); } } diff --git a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z80_13_AVX512.c b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z80_13_AVX512.c index 4270e81f947265ff6161bfedf0acec0afc112af4..35ec155a7b718b7f96f50cb2a5f86cb7e3505aee 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z80_13_AVX512.c +++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z80_13_AVX512.c @@ -1,2340 +1,2261 @@ #include <stdint.h> #include <immintrin.h> -__m512i _mm512_sign_epi16(__m512i a, __m512i b){ -b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); -b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); - a = _mm512_mullo_epi16(a, b); -return a; -} +#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a) void nrLDPC_cnProc_BG1_Z80_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) { //Process group with 3 BNs - __m512i zmm0, min, sgn,ones,maxLLR; - ones = _mm512_set1_epi8((char)1); + __m512i zmm0, min, sgn,zeros,maxLLR; + zeros = _mm512_setzero_si512(); maxLLR = _mm512_set1_epi8((char)127); - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[6+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[6+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[0+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[7+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[0+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[7+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[12+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[12+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[6+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[13+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[13+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[7+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<4;i+=2) { - zmm0 = ((__m512i*)cnProcBuf)[0+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + for (int i=0;i<2;i+=2) { + sgn = ((__m512i*)cnProcBuf)[0+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[6+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[24+i] = _mm512_sign_epi16(min, sgn); - zmm0 = ((__m512i*)cnProcBuf)[1+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); + ((__m512i*)cnProcBufRes)[12+i] = conditional_negate(min, sgn,zeros); + sgn = ((__m512i*)cnProcBuf)[1+i]; + min = _mm512_abs_epi8(sgn); zmm0 = ((__m512i*)cnProcBuf)[7+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[25+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[13+i] = conditional_negate(min, sgn,zeros); } //Process group with 4 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[66+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[48+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[36+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[18+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[96+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[48+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[126+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[108+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[156+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[78+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[36+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[66+i]; + for (int i=0;i<7;i++) { + sgn = ((__m512i*)cnProcBuf)[18+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[48+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[96+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[78+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[216+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[108+i] = conditional_negate(min, sgn,zeros); } //Process group with 5 BNs - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[384+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[246+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[276+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[138+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[492+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[246+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[708+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[354+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[708+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[570+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[924+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[462+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<46;i++) { - zmm0 = ((__m512i*)cnProcBuf)[276+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[384+i]; + for (int i=0;i<23;i++) { + sgn = ((__m512i*)cnProcBuf)[138+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[246+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[492+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[354+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[600+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[462+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1140+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[570+i] = conditional_negate(min, sgn,zeros); } //Process group with 6 BNs - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[726+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1356+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[678+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1452+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[726+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1548+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[774+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1644+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[822+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1596+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[918+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1740+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[870+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<21;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1356+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1404+i]; + for (int i=0;i<10;i++) { + sgn = ((__m512i*)cnProcBuf)[678+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[726+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1452+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[774+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1500+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[822+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1548+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[870+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1836+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[918+i] = conditional_negate(min, sgn,zeros); } //Process group with 7 BNs - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[996+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1932+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[966+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[1992+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[996+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2052+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1026+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2112+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1056+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2172+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1086+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2112+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1146+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2232+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1116+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<14;i++) { - zmm0 = ((__m512i*)cnProcBuf)[1932+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1962+i]; + for (int i=0;i<7;i++) { + sgn= ((__m512i*)cnProcBuf)[966+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[996+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[1992+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1026+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2022+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1056+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2052+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1086+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2082+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1116+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2292+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1146+i] = conditional_negate(min, sgn,zeros); } //Process group with 8 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1188+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2352+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1176+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2376+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1188+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2400+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1200+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2424+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1212+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2448+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1224+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2472+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1236+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2408+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2436+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1260+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2496+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1248+i] = conditional_negate(min, sgn,zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2352+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2364+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1176+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1188+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2376+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1200+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2388+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1212+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2400+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1224+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2472+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1232+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2424+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1248+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2520+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1260+i] = conditional_negate(min, sgn,zeros); } //Process group with 9 BNs - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2556+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1284+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2544+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1272+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<6;i++) { - zmm0 = ((__m512i*)cnProcBuf)[2544+i]; - sgn = _mm512_sign_epi16(ones, zmm0); - min = _mm512_abs_epi8(zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2568+i]; + for (int i=0;i<3;i++) { + sgn = ((__m512i*)cnProcBuf)[1272+i]; + min = _mm512_abs_epi8(sgn); + zmm0 = ((__m512i*)cnProcBuf)[1296+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2580+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1308+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2592+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1320+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2604+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1332+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2616+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1344+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2628+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1356+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); - zmm0 = ((__m512i*)cnProcBuf)[2640+i]; + sgn = _mm512_xor_si512(sgn, zmm0); + zmm0 = ((__m512i*)cnProcBuf)[1368+i]; min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); - sgn = _mm512_sign_epi16(sgn, zmm0); + sgn = _mm512_xor_si512(sgn, zmm0); min = _mm512_min_epu8(min, maxLLR); - ((__m512i*)cnProcBufRes)[2568+i] = _mm512_sign_epi16(min, sgn); + ((__m512i*)cnProcBufRes)[1284+i] = conditional_negate(min, sgn, zeros); } - for (int i=0;i<