Commit 0c499324 authored by Quency Lin

fix: CI failure due to SIMD misalignment

parent 797f8cec
...@@ -950,30 +950,30 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in, ...@@ -950,30 +950,30 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in,
simde__m128i ch_mag_over_2; simde__m128i ch_mag_over_2;
simde__m128i ch_mag_9_over_10; simde__m128i ch_mag_9_over_10;
simde__m128i xmm0 __attribute__((aligned(16))); simde__m128i xmm0;
simde__m128i xmm1 __attribute__((aligned(16))); simde__m128i xmm1;
simde__m128i xmm2 __attribute__((aligned(16))); simde__m128i xmm2;
simde__m128i xmm3 __attribute__((aligned(16))); simde__m128i xmm3;
simde__m128i xmm4 __attribute__((aligned(16))); simde__m128i xmm4;
simde__m128i xmm5 __attribute__((aligned(16))); simde__m128i xmm5;
simde__m128i xmm6 __attribute__((aligned(16))); simde__m128i xmm6;
simde__m128i xmm7 __attribute__((aligned(16))); simde__m128i xmm7;
simde__m128i rho_rpi __attribute__((aligned(16))); simde__m128i rho_rpi;
simde__m128i rho_rmi __attribute__((aligned(16))); simde__m128i rho_rmi;
simde__m128i rho_rs[8] __attribute__((aligned(16))); simde__m128i rho_rs[8];
simde__m128i psi_rs[16] __attribute__((aligned(16))); simde__m128i psi_rs[16];
simde__m128i psi_is[16] __attribute__((aligned(16))); simde__m128i psi_is[16];
simde__m128i a_rs[16] __attribute__((aligned(16))); simde__m128i a_rs[16];
simde__m128i a_is[16] __attribute__((aligned(16))); simde__m128i a_is[16];
simde__m128i psi_as[16] __attribute__((aligned(16))); simde__m128i psi_as[16];
simde__m128i a_sqs[16] __attribute__((aligned(16))); simde__m128i a_sqs[16];
simde__m128i y0_s[8] __attribute__((aligned(16))); simde__m128i y0_s[8];
simde__m128i y0r __attribute__((aligned(16))); simde__m128i y0r;
simde__m128i y0i __attribute__((aligned(16))); simde__m128i y0i;
simde__m128i y1r __attribute__((aligned(16))); simde__m128i y1r;
simde__m128i y1i __attribute__((aligned(16))); simde__m128i y1i;
// In one iteration, we deal with 8 REs // In one iteration, we deal with 8 REs
for (int i = 0; i < length >> 2; i += 2) { for (int i = 0; i < length >> 2; i += 2) {
...@@ -1070,7 +1070,7 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in, ...@@ -1070,7 +1070,7 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in,
ch_mag_9_over_10 = simde_mm_slli_epi16(ch_mag_9_over_10, 2); ch_mag_9_over_10 = simde_mm_slli_epi16(ch_mag_9_over_10, 2);
/// Compute bit metrics (lambda) /// Compute bit metrics (lambda)
simde__m128i bit_mets[16] __attribute__((aligned(16))); simde__m128i bit_mets[16];
for(int j=0; j<8; j+=4){ for(int j=0; j<8; j+=4){
bit_mets[j+0] = simde_mm_subs_epi16(psi_as[j+0], a_sqs[j+0]); bit_mets[j+0] = simde_mm_subs_epi16(psi_as[j+0], a_sqs[j+0]);
bit_mets[j+0] = simde_mm_adds_epi16(bit_mets[j+0], y0_s[j+0]); bit_mets[j+0] = simde_mm_adds_epi16(bit_mets[j+0], y0_s[j+0]);
...@@ -1163,28 +1163,28 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in, ...@@ -1163,28 +1163,28 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in,
simde__m256i ch_mag_over_2; simde__m256i ch_mag_over_2;
simde__m256i ch_mag_9_over_10; simde__m256i ch_mag_9_over_10;
simde__m256i xmm2 __attribute__((aligned(16))); simde__m256i xmm2;
simde__m256i xmm3 __attribute__((aligned(16))); simde__m256i xmm3;
simde__m256i xmm4 __attribute__((aligned(16))); simde__m256i xmm4;
simde__m256i xmm5 __attribute__((aligned(16))); simde__m256i xmm5;
simde__m256i xmm6 __attribute__((aligned(16))); simde__m256i xmm6;
simde__m256i xmm7 __attribute__((aligned(16))); simde__m256i xmm7;
simde__m256i rho_rpi __attribute__((aligned(16))); simde__m256i rho_rpi;
simde__m256i rho_rmi __attribute__((aligned(16))); simde__m256i rho_rmi;
simde__m256i rho_rs[8] __attribute__((aligned(16))); simde__m256i rho_rs[8];
simde__m256i psi_rs[16] __attribute__((aligned(16))); simde__m256i psi_rs[16];
simde__m256i psi_is[16] __attribute__((aligned(16))); simde__m256i psi_is[16];
simde__m256i a_rs[16] __attribute__((aligned(16))); simde__m256i a_rs[16];
simde__m256i a_is[16] __attribute__((aligned(16))); simde__m256i a_is[16];
simde__m256i psi_as[16] __attribute__((aligned(16))); simde__m256i psi_as[16];
simde__m256i a_sqs[16] __attribute__((aligned(16))); simde__m256i a_sqs[16];
simde__m256i y0_s[8] __attribute__((aligned(16))); simde__m256i y0_s[8];
simde__m256i y0r __attribute__((aligned(16))); simde__m256i y0r;
simde__m256i y0i __attribute__((aligned(16))); simde__m256i y0i;
simde__m256i y1r __attribute__((aligned(16))); simde__m256i y1r;
simde__m256i y1i __attribute__((aligned(16))); simde__m256i y1i;
// In one iteration, we deal with 8 REs // In one iteration, we deal with 8 REs
for (int i = 0; i < length >> 3; i += 2) { for (int i = 0; i < length >> 3; i += 2) {
...@@ -1282,7 +1282,7 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in, ...@@ -1282,7 +1282,7 @@ void nr_ulsch_qam16_qam16(c16_t *stream0_in,
/// Compute bit metrics (lambda) /// Compute bit metrics (lambda)
simde__m256i bit_mets[16] __attribute__((aligned(16))); simde__m256i bit_mets[16];
for(int j=0; j<8; j+=4){ for(int j=0; j<8; j+=4){
bit_mets[j+0] = simde_mm256_subs_epi16(psi_as[j+0], a_sqs[j+0]); bit_mets[j+0] = simde_mm256_subs_epi16(psi_as[j+0], a_sqs[j+0]);
bit_mets[j+0] = simde_mm256_adds_epi16(bit_mets[j+0], y0_s[j+0]); bit_mets[j+0] = simde_mm256_adds_epi16(bit_mets[j+0], y0_s[j+0]);
...@@ -1454,7 +1454,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1454,7 +1454,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
simde__m128i rho_rmi = simde_mm_subs_epi16(xmm2, xmm3); // rho* = Re(rho) - Im(rho) simde__m128i rho_rmi = simde_mm_subs_epi16(xmm2, xmm3); // rho* = Re(rho) - Im(rho)
// Compute the different rhos // Compute the different rhos
simde__m128i rho_rs[32] __attribute__((aligned(16))); simde__m128i rho_rs[32];
rho_rs[27] = simde_mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42); rho_rs[27] = simde_mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
rho_rs[28] = simde_mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42); rho_rs[28] = simde_mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
rho_rs[18] = simde_mm_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42); rho_rs[18] = simde_mm_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
...@@ -1518,14 +1518,14 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1518,14 +1518,14 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
xmm0 = simde_mm_set1_epi16(0); // ZERO for abs_pi16 xmm0 = simde_mm_set1_epi16(0); // ZERO for abs_pi16
xmm2 = simde_mm_subs_epi16(rho_rs[0], y1r); xmm2 = simde_mm_subs_epi16(rho_rs[0], y1r);
simde__m128i psi_r_s[64] __attribute__((aligned(16))); simde__m128i psi_r_s[64];
for(int j=0; j<32; j++) // psi_r_s[0~31], rho_rs[0~31] for(int j=0; j<32; j++) // psi_r_s[0~31], rho_rs[0~31]
psi_r_s[j] = simde_mm_abs_epi16(simde_mm_subs_epi16(rho_rs[j], y1r)); psi_r_s[j] = simde_mm_abs_epi16(simde_mm_subs_epi16(rho_rs[j], y1r));
for(int j=32; j<64; j++) // psi_r_s[32~64], rho_rs[31~0] for(int j=32; j<64; j++) // psi_r_s[32~64], rho_rs[31~0]
psi_r_s[j] = simde_mm_abs_epi16(simde_mm_adds_epi16(rho_rs[63 - j], y1r)); psi_r_s[j] = simde_mm_abs_epi16(simde_mm_adds_epi16(rho_rs[63 - j], y1r));
// simde__m128i psi_i calculation from rho_rpi or rho_rmi // simde__m128i psi_i calculation from rho_rpi or rho_rmi
simde__m128i psi_i_s[64] __attribute__((aligned(16))); simde__m128i psi_i_s[64];
const uint8_t rho_rs_index[32] = {7,15,23,31,24,16,8,0,6,14,22,30,25,17,9,1,5,13,21,29,26,18,10,2,4,12,20,28,27,19,11,3}; const uint8_t rho_rs_index[32] = {7,15,23,31,24,16,8,0,6,14,22,30,25,17,9,1,5,13,21,29,26,18,10,2,4,12,20,28,27,19,11,3};
for(int k=0; k<32; k+=8){ // psi_i_s[0~31] for(int k=0; k<32; k+=8){ // psi_i_s[0~31]
for(int j=k; j<k+4; j++) for(int j=k; j<k+4; j++)
...@@ -1566,7 +1566,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1566,7 +1566,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
y0i_seven_over_sqrt_21 = simde_mm_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42); y0i_seven_over_sqrt_21 = simde_mm_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
y0i_seven_over_sqrt_21 = simde_mm_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14 y0i_seven_over_sqrt_21 = simde_mm_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14
simde__m128i y0_s[64] __attribute__((aligned(16))); simde__m128i y0_s[64];
const simde__m128i y0r_over_s[8] = {y0r_seven_over_sqrt_21,y0r_five_over_sqrt_21,y0r_three_over_sqrt_21,y0r_one_over_sqrt_21}; const simde__m128i y0r_over_s[8] = {y0r_seven_over_sqrt_21,y0r_five_over_sqrt_21,y0r_three_over_sqrt_21,y0r_one_over_sqrt_21};
for(int j=0; j<32; j+=8){ for(int j=0; j<32; j+=8){
y0_s[j+0] = simde_mm_adds_epi16(y0r_over_s[j>>3], y0i_seven_over_sqrt_21); y0_s[j+0] = simde_mm_adds_epi16(y0r_over_s[j>>3], y0i_seven_over_sqrt_21);
...@@ -1582,10 +1582,10 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1582,10 +1582,10 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
ch_mag_int_with_sigma2 = simde_mm_srai_epi16(ch_mag_int, 1); // *2 ch_mag_int_with_sigma2 = simde_mm_srai_epi16(ch_mag_int, 1); // *2
two_ch_mag_int_with_sigma2 = ch_mag_int; // *4 two_ch_mag_int_with_sigma2 = ch_mag_int; // *4
three_ch_mag_int_with_sigma2 = simde_mm_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6 three_ch_mag_int_with_sigma2 = simde_mm_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6
simde__m128i a_r_s[64] __attribute__((aligned(16))); simde__m128i a_r_s[64];
simde__m128i a_i_s[64] __attribute__((aligned(16))); simde__m128i a_i_s[64];
simde__m128i psi_a_s[64] __attribute__((aligned(16))); simde__m128i psi_a_s[64];
simde__m128i a_sq_s[64] __attribute__((aligned(16))); simde__m128i a_sq_s[64];
for(int j=0; j<64; j++){ for(int j=0; j<64; j++){
// Detection of interference term // Detection of interference term
a_r_s[j] = interference_abs_64qam_epi16(psi_r_s[j], ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42, SEVEN_OVER_SQRT_2_42); a_r_s[j] = interference_abs_64qam_epi16(psi_r_s[j], ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42, SEVEN_OVER_SQRT_2_42);
...@@ -1634,7 +1634,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1634,7 +1634,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
ch_mag_98_over_42_with_sigma2 = simde_mm_slli_epi16(ch_mag_98_over_42_with_sigma2, 2); ch_mag_98_over_42_with_sigma2 = simde_mm_slli_epi16(ch_mag_98_over_42_with_sigma2, 2);
// Computing Metrics // Computing Metrics
simde__m128i bit_met_s[64] __attribute__((aligned(16))); simde__m128i bit_met_s[64];
xmm1 = simde_mm_adds_epi16(simde_mm_subs_epi16(psi_a_s[0], a_sq_s[0]), y0_s[0]); xmm1 = simde_mm_adds_epi16(simde_mm_subs_epi16(psi_a_s[0], a_sq_s[0]), y0_s[0]);
bit_met_s[0] = simde_mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2); bit_met_s[0] = simde_mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
...@@ -1997,7 +1997,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -1997,7 +1997,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
simde__m256i rho_rmi = simde_mm256_subs_epi16(xmm2, xmm3); // rho* = Re(rho) - Im(rho) simde__m256i rho_rmi = simde_mm256_subs_epi16(xmm2, xmm3); // rho* = Re(rho) - Im(rho)
// Compute the different rhos // Compute the different rhos
simde__m256i rho_rs[32] __attribute__((aligned(16))); simde__m256i rho_rs[32];
rho_rs[27] = simde_mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42); rho_rs[27] = simde_mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
rho_rs[28] = simde_mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42); rho_rs[28] = simde_mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
rho_rs[18] = simde_mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42); rho_rs[18] = simde_mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
...@@ -2061,14 +2061,14 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -2061,14 +2061,14 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
xmm0 = simde_mm256_broadcastw_epi16(simde_mm_set1_epi16(0)); // ZERO for abs_pi16 xmm0 = simde_mm256_broadcastw_epi16(simde_mm_set1_epi16(0)); // ZERO for abs_pi16
xmm2 = simde_mm256_subs_epi16(rho_rs[0], y1r); xmm2 = simde_mm256_subs_epi16(rho_rs[0], y1r);
simde__m256i psi_r_s[64] __attribute__((aligned(16))); simde__m256i psi_r_s[64];
for(int j=0; j<32; j++) // psi_r_s[0~31], rho_rs[0~31] for(int j=0; j<32; j++) // psi_r_s[0~31], rho_rs[0~31]
psi_r_s[j] = simde_mm256_abs_epi16(simde_mm256_subs_epi16(rho_rs[j], y1r)); psi_r_s[j] = simde_mm256_abs_epi16(simde_mm256_subs_epi16(rho_rs[j], y1r));
for(int j=32; j<64; j++) // psi_r_s[32~64], rho_rs[31~0] for(int j=32; j<64; j++) // psi_r_s[32~64], rho_rs[31~0]
psi_r_s[j] = simde_mm256_abs_epi16(simde_mm256_adds_epi16(rho_rs[63 - j], y1r)); psi_r_s[j] = simde_mm256_abs_epi16(simde_mm256_adds_epi16(rho_rs[63 - j], y1r));
// simde__m256i psi_i calculation from rho_rpi or rho_rmi // simde__m256i psi_i calculation from rho_rpi or rho_rmi
simde__m256i psi_i_s[64] __attribute__((aligned(16))); simde__m256i psi_i_s[64];
const uint8_t rho_rs_index[32] = {7,15,23,31,24,16,8,0,6,14,22,30,25,17,9,1,5,13,21,29,26,18,10,2,4,12,20,28,27,19,11,3}; const uint8_t rho_rs_index[32] = {7,15,23,31,24,16,8,0,6,14,22,30,25,17,9,1,5,13,21,29,26,18,10,2,4,12,20,28,27,19,11,3};
for(int k=0; k<32; k+=8){ // psi_i_s[0~31] for(int k=0; k<32; k+=8){ // psi_i_s[0~31]
for(int j=k; j<k+4; j++) for(int j=k; j<k+4; j++)
...@@ -2109,7 +2109,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -2109,7 +2109,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
y0i_seven_over_sqrt_21 = simde_mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42); y0i_seven_over_sqrt_21 = simde_mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
y0i_seven_over_sqrt_21 = simde_mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14 y0i_seven_over_sqrt_21 = simde_mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14
simde__m256i y0_s[64] __attribute__((aligned(16))); simde__m256i y0_s[64];
const simde__m256i y0r_over_s[8] = {y0r_seven_over_sqrt_21,y0r_five_over_sqrt_21,y0r_three_over_sqrt_21,y0r_one_over_sqrt_21}; const simde__m256i y0r_over_s[8] = {y0r_seven_over_sqrt_21,y0r_five_over_sqrt_21,y0r_three_over_sqrt_21,y0r_one_over_sqrt_21};
for(int j=0; j<32; j+=8){ for(int j=0; j<32; j+=8){
y0_s[j+0] = simde_mm256_adds_epi16(y0r_over_s[j>>3], y0i_seven_over_sqrt_21); y0_s[j+0] = simde_mm256_adds_epi16(y0r_over_s[j>>3], y0i_seven_over_sqrt_21);
...@@ -2125,10 +2125,10 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -2125,10 +2125,10 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
ch_mag_int_with_sigma2 = simde_mm256_srai_epi16(ch_mag_int, 1); // *2 ch_mag_int_with_sigma2 = simde_mm256_srai_epi16(ch_mag_int, 1); // *2
two_ch_mag_int_with_sigma2 = ch_mag_int; // *4 two_ch_mag_int_with_sigma2 = ch_mag_int; // *4
three_ch_mag_int_with_sigma2 = simde_mm256_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6 three_ch_mag_int_with_sigma2 = simde_mm256_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6
simde__m256i a_r_s[64] __attribute__((aligned(16))); simde__m256i a_r_s[64];
simde__m256i a_i_s[64] __attribute__((aligned(16))); simde__m256i a_i_s[64];
simde__m256i psi_a_s[64] __attribute__((aligned(16))); simde__m256i psi_a_s[64];
simde__m256i a_sq_s[64] __attribute__((aligned(16))); simde__m256i a_sq_s[64];
for(int j=0; j<64; j++){ for(int j=0; j<64; j++){
// Detection of interference term // Detection of interference term
a_r_s[j] = interference_abs_64qam_epi16_256(psi_r_s[j], ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42, SEVEN_OVER_SQRT_2_42); a_r_s[j] = interference_abs_64qam_epi16_256(psi_r_s[j], ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42, SEVEN_OVER_SQRT_2_42);
...@@ -2177,7 +2177,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in, ...@@ -2177,7 +2177,7 @@ void nr_ulsch_qam64_qam64(c16_t *stream0_in,
ch_mag_98_over_42_with_sigma2 = simde_mm256_slli_epi16(ch_mag_98_over_42_with_sigma2, 2); ch_mag_98_over_42_with_sigma2 = simde_mm256_slli_epi16(ch_mag_98_over_42_with_sigma2, 2);
// Computing Metrics // Computing Metrics
simde__m256i bit_met_s[64] __attribute__((aligned(16))); simde__m256i bit_met_s[64];
xmm1 = simde_mm256_adds_epi16(simde_mm256_subs_epi16(psi_a_s[0], a_sq_s[0]), y0_s[0]); xmm1 = simde_mm256_adds_epi16(simde_mm256_subs_epi16(psi_a_s[0], a_sq_s[0]), y0_s[0]);
bit_met_s[0] = simde_mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2); bit_met_s[0] = simde_mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment