Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wangjie
OpenXG-RAN
Commits
a3d5d0f8
Commit
a3d5d0f8
authored
Jun 17, 2018
by
Martino
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Kernal operation with avx and avx2 (commented)
parent
a611e0b9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
61 additions
and
56 deletions
+61
-56
openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
...air1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
+61
-56
No files found.
openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
View file @
a3d5d0f8
...
@@ -9,78 +9,83 @@ void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
...
@@ -9,78 +9,83 @@ void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
{
{
// Martino's algorithm to avoid multiplication for the generating matrix of polar codes
// Martino's algorithm to avoid multiplication for the generating matrix of polar codes
uint
16
_t
i
,
j
;
uint
32
_t
i
,
j
;
for
(
i
=
0
;
i
<
N
;
i
++
)
// Create the elements of d=u*G_N ...
for
(
i
=
0
;
i
<
N
;
i
++
)
// Create the elements of d=u*G_N ...
{
{
d
[
i
]
=
0
;
d
[
i
]
=
0
;
for
(
j
=
0
;
j
<
N
;
j
++
)
// ... looking at all the elements of u
for
(
j
=
0
;
j
<
N
;
j
++
)
// ... looking at all the elements of u
{
{
//d[i]=d[i] || ( (!(j-i)) | (!i) )*u[j];
d
[
i
]
=
d
[
i
]
^
(
!
(
(
j
-
i
)
&
i
))
*
u
[
j
];
d
[
i
]
=
d
[
i
]
^
(
!
(
(
j
-
i
)
&
i
))
*
u
[
j
];
}
}
//d[i]=d[i]%2; // modulo 2
}
}
/*
/*
__m256i maddReg, uReg, orReg;
* It works, but there are too many moves from memory and it's slow. With AVX-512 it could be done faster
__m512i maddRegConv;
*
__m256i bitJIReg, bitIReg;
__m256i A,B,C,E, OUT;
uint8_t bitJI[32];
uint8_t bitI[32];
uint32_t dTest[8];
int sumPartial;
uint32_t jiArray[8];
uint8_t indToInit;
uint32_t iArray[8];
uint32_t uArray[8];
uint32_t k;
uint32_t toCheck[8];
for(i=0; i<N; i++) // Create the elements of d=u*G_N ...
for(i=0; i<N; i+=8)
{
{
d[i]=0;
iArray[0]=i;
for(j=0; j<N; j+=32) // ... looking at all the elements of u 32 at a time
iArray[1]=i+1;
{
iArray[2]=i+2;
//d[i]=d[i]+( (!(j-i)) | (!i) )*u[j]; <--- THIS IN INTRINSIC
iArray[3]=i+3;
// Products between ( (!(j-i)) | (!i) ) and u[j] and sum all with a reduce add
iArray[4]=i+4;
iArray[5]=i+5;
iArray[6]=i+6;
iArray[7]=i+7;
uReg = _mm256_maskz_loadu_epi8 (0xFFFFFFFF, (void const*)&u[j]); // load 32 8-bit from u
OUT=_mm256_setzero_si256();
for(j=0; j<N; j++)
//init arrays for (!(i-j)) and for (!i)
{
for(indToInit=0; indToInit<32; indToInit++)
//initialisation
{
jiArray[0]=j-i;
// j = j*32+indToInit
jiArray[1]=j-(i+1);
bitJI[j*32+indToInit] = !((j*32+indToInit)-i); // (!(j-i))
jiArray[2]=j-(i+2);
bitI[j*32+indToInit] = !i; // (!i)
jiArray[3]=j-(i+3);
}
jiArray[4]=j-(i+4);
jiArray[5]=j-(i+5);
bitJIReg = _mm256_maskz_loadu_epi8(0xFFFFFFFF, (void const*)bitJI); // 32x8-bit
jiArray[6]=j-(i+6);
bitIReg = _mm256_maskz_loadu_epi8(0xFFFFFFFF, (void const*)bitI); // 32x8-bit
jiArray[7]=j-(i+7);
orReg=_mm256_or_si256(bitWise1, bitWise2); // (!(j-i)) | (!i) 32x8-bit
maddReg=_mm256_maddubs_epi16(uReg, orReg); //a1*b1+a2*b2 from 32x8 to 16x16-bit
maddRegConv= _mm512_cvtepi16_epi32(maddReg); //convert to 16x32-bit
sumPartial = _mm512_reduce_add_epi32(maddRegConv); //sum all 16 values
d[i] = d[i] + sumPartial; //store in the final variable
uArray[0]=(uint32_t)u[j];
}
uArray[1]=(uint32_t)u[j];
uArray[2]=(uint32_t)u[j];
uArray[3]=(uint32_t)u[j];
uArray[4]=(uint32_t)u[j];
uArray[5]=(uint32_t)u[j];
uArray[6]=(uint32_t)u[j];
uArray[7]=(uint32_t)u[j];
A=_mm256_loadu_si256((__m256i const*)jiArray);
B=_mm256_loadu_si256((__m256i const*)iArray);
C=_mm256_and_si256(A, B); //mask: if zero, then add
d[i]=d[i]%2; // modulo 2
_mm256_storeu_si256((__m256i*)toCheck, C);
}
for(k=0; k<8; k++)
*/
{
toCheck[k]=!toCheck[k] << 31;
}
C=_mm256_loadu_si256((__m256i const*)toCheck); //mask: if 1, add
/*
E=_mm256_maskload_epi32((int const*)uArray, C);
__m128 num1, num2, num3, num4;
OUT=_mm256_xor_si256(OUT, E); //32 bit x 8
}
_mm256_storeu_si256((__m256i*)&dTest, OUT);
for (uint16_t i = 0; i < col; i++) {
for(k=0; k<8; k++)
num4=_mm_setzero_ps(); //sets sum to zero
{
for (uint16_t j = 0; j < row; j+=4) {
d[i+k]=(uint8_t)dTest[k]; //Conv from 32 to 8
//output[i] += matrix1[j] * matrix2[j][i];
num1=_mm_load_ps((float*)&matrix1[j]); // 1[3], 1[2], 1[1], 1[0] -> num1
num2=_mm_load_ps((float*)&matrix2[j][i]); // 2[3], 2[2], 2[1], 2[0] -> num2
num3=_mm_mul_ps(num1, num2); // 1[3]*2[3],...1[0]*2[0] -> num3
num3=_mm_hadd_ps(num3, num3); //1[3]*2[3]+1[2]*2[2] ...
num4 = _mm_add_ps(num4, num3);
}
}
num4= _mm_hadd_ps(num4, num4);
_mm_store_ss(&output[i], num4); // Stores only the lower SP FP that contain the sum
}
}
*/
*/
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment