Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
spbro
OpenXG-RAN
Commits
10d7fe9c
Commit
10d7fe9c
authored
Nov 14, 2023
by
laurent
Committed by
Raymond Knopp
Nov 28, 2023
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix regressions in cuda ldpc
parent
697a7871
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
433 additions
and
468 deletions
+433
-468
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+433
-468
No files found.
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
View file @
10d7fe9c
/*! \file PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
/*! \file PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
* \brief LDPC cuda support BG1 all length
* \brief LDPC cuda support BG1 all length
* \author NCTU OpinConnect Terng-Yin Hsu,WEI-YING,LIN
* \author NCTU OpinConnect Terng-Yin Hsu,WEI-YING,LIN
* \email tyhsu@cs.nctu.edu.tw
* \email tyhsu@cs.nctu.edu.tw
* \date 13-05-2020
* \date 13-05-2020
* \version
* \version
* \note
* \note
* \warning
* \warning
*/
*/
...
@@ -31,17 +31,17 @@
...
@@ -31,17 +31,17 @@
#include "bgs/BG2_I5"
#include "bgs/BG2_I5"
#include "bgs/BG2_I6"
#include "bgs/BG2_I6"
#include "bgs/BG2_I7"
#include "bgs/BG2_I7"
#define MAX_ITERATION 2
#define MAX_ITERATION 2
#define MC 1
#define MC 1
typedef
void
decode_abort_t
;
#define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
inline
void
cudaAssert
(
cudaError_t
code
,
const
char
*
file
,
int
line
)
inline
void
cudaAssert
(
cudaError_t
code
,
const
char
*
file
,
int
line
)
{
{
if
(
code
!=
cudaSuccess
)
{
if
(
code
!=
cudaSuccess
)
{
fprintf
(
stderr
,
"GPUassert: %s %s %d
\n
"
,
cudaGetErrorString
(
code
),
file
,
line
);
fprintf
(
stderr
,
"GPUassert: %s %s %d
\n
"
,
cudaGetErrorString
(
code
),
file
,
line
);
exit
(
code
);
exit
(
code
);
}
}
}
}
typedef
struct
{
typedef
struct
{
...
@@ -66,540 +66,505 @@ __device__ h_element dev_h_compact2[68*30]; // used in kernel 2
...
@@ -66,540 +66,505 @@ __device__ h_element dev_h_compact2[68*30]; // used in kernel 2
// __device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2
// __device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2
// row and col element count
// row and col element count
__device__
__constant__
char
h_ele_row_bg1_count
[
46
]
=
{
__device__
__constant__
char
h_ele_row_bg1_count
[
46
]
=
{
19
,
19
,
19
,
19
,
3
,
8
,
9
,
7
,
10
,
9
,
7
,
8
,
7
,
6
,
7
,
7
,
6
,
6
,
6
,
6
,
6
,
6
,
5
,
19
,
19
,
19
,
19
,
3
,
8
,
9
,
7
,
10
,
9
,
5
,
6
,
5
,
5
,
4
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
4
,
5
,
5
,
4
};
7
,
8
,
7
,
6
,
7
,
7
,
6
,
6
,
6
,
6
,
6
,
6
,
5
,
5
,
6
,
5
,
5
,
4
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
4
,
5
,
5
,
4
};
__device__
__constant__
char
h_ele_col_bg1_count
[
68
]
=
{
__device__
__constant__
char
h_ele_col_bg1_count
[
68
]
=
{
30
,
28
,
7
,
11
,
9
,
4
,
8
,
12
,
8
,
7
,
30
,
28
,
7
,
11
,
9
,
4
,
8
,
12
,
8
,
7
,
12
,
10
,
12
,
11
,
10
,
7
,
10
,
10
,
13
,
7
,
8
,
11
,
12
,
5
,
6
,
6
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
12
,
10
,
12
,
11
,
10
,
7
,
10
,
10
,
13
,
7
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
};
8
,
11
,
12
,
5
,
6
,
6
,
1
,
1
,
1
,
1
,
__device__
__constant__
char
h_ele_row_bg2_count
[
42
]
=
{
8
,
10
,
8
,
10
,
4
,
6
,
6
,
6
,
4
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
5
,
4
,
4
,
4
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
4
,
3
,
4
,
4
,
3
,
5
,
3
,
4
,
3
,
5
,
3
,
4
,
4
,
4
,
4
,
4
,
3
,
4
,
4
,
4
,
4
};
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
__device__
__constant__
char
h_ele_col_bg2_count
[
52
]
=
{
22
,
23
,
10
,
5
,
5
,
14
,
7
,
13
,
6
,
8
,
9
,
16
,
9
,
12
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
};
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
};
__device__
__constant__
char
h_ele_row_bg2_count
[
42
]
=
{
8
,
10
,
8
,
10
,
4
,
6
,
6
,
6
,
4
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
5
,
4
,
4
,
4
,
4
,
3
,
4
,
4
,
3
,
5
,
3
,
4
,
3
,
5
,
3
,
4
,
4
,
4
,
4
,
4
,
3
,
4
,
4
,
4
,
4
};
__device__
__constant__
char
h_ele_col_bg2_count
[
52
]
=
{
22
,
23
,
10
,
5
,
5
,
14
,
7
,
13
,
6
,
8
,
9
,
16
,
9
,
12
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
};
__global__
void
warmup
()
__global__
void
warmup
()
{
{
// warm up gpu for time measurement
// warm up gpu for time measurement
}
}
extern
"C"
extern
"C"
void
warmup_for_GPU
(){
void
warmup_for_GPU
(){
warmup
<<<
20
,
1024
>>>
();
warmup
<<<
20
,
1024
>>>
();
}
}
extern
"C"
extern
"C"
void
set_compact_BG
(
int
Zc
,
short
BG
){
void
set_compact_BG
(
int
Zc
,
short
BG
){
int
row
,
col
;
int
row
,
col
;
if
(
BG
==
1
)
{
if
(
BG
==
1
){
row
=
46
;
row
=
46
;
col
=
68
;
col
=
68
;
}
else
{
}
row
=
42
;
else
{
col
=
52
;
row
=
42
;
}
col
=
52
;
int
compact_row
=
30
;
}
int
compact_col
=
19
;
int
compact_row
=
30
;
if
(
BG
==
2
)
{
int
compact_col
=
19
;
compact_row
=
10
,
compact_col
=
23
;
if
(
BG
==
2
){
compact_row
=
10
,
compact_col
=
23
;}
}
int
memorySize_h_compact1
=
row
*
compact_col
*
sizeof
(
h_element
);
int
memorySize_h_compact1
=
row
*
compact_col
*
sizeof
(
h_element
);
int
memorySize_h_compact2
=
compact_row
*
col
*
sizeof
(
h_element
);
int
memorySize_h_compact2
=
compact_row
*
col
*
sizeof
(
h_element
);
int
lift_index
=
0
;
int
lift_index
=
0
;
short
lift_set
[][
9
]
=
{
short
lift_set
[][
9
]
=
{{
2
,
4
,
8
,
16
,
32
,
64
,
128
,
256
},
{
2
,
4
,
8
,
16
,
32
,
64
,
128
,
256
},
{
3
,
6
,
12
,
24
,
48
,
96
,
192
,
384
},
{
3
,
6
,
12
,
24
,
48
,
96
,
192
,
384
},
{
5
,
10
,
20
,
40
,
80
,
160
,
320
},
{
5
,
10
,
20
,
40
,
80
,
160
,
320
},
{
7
,
14
,
28
,
56
,
112
,
224
},
{
7
,
14
,
28
,
56
,
112
,
224
},
{
9
,
18
,
36
,
72
,
144
,
288
},
{
9
,
18
,
36
,
72
,
144
,
288
},
{
11
,
22
,
44
,
88
,
176
,
352
},
{
11
,
22
,
44
,
88
,
176
,
352
},
{
13
,
26
,
52
,
104
,
208
},
{
13
,
26
,
52
,
104
,
208
},
{
15
,
30
,
60
,
120
,
240
},
{
15
,
30
,
60
,
120
,
240
},
{
0
}};
{
0
}
};
for
(
int
i
=
0
;
lift_set
[
i
][
0
]
!=
0
;
i
++
)
{
for
(
int
j
=
0
;
lift_set
[
i
][
j
]
!=
0
;
j
++
)
{
for
(
int
i
=
0
;
lift_set
[
i
][
0
]
!=
0
;
i
++
){
if
(
Zc
==
lift_set
[
i
][
j
])
{
for
(
int
j
=
0
;
lift_set
[
i
][
j
]
!=
0
;
j
++
){
lift_index
=
i
;
if
(
Zc
==
lift_set
[
i
][
j
]){
break
;
lift_index
=
i
;
}
break
;
}
}
}
}
printf
(
"
\n
Zc = %d BG = %d
\n
"
,
Zc
,
BG
);
}
switch
(
lift_index
)
{
printf
(
"
\n
Zc = %d BG = %d
\n
"
,
Zc
,
BG
);
case
0
:
switch
(
lift_index
){
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I0
,
memorySize_h_compact1
));
case
0
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I0
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I0
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I0
,
memorySize_h_compact2
)
);
case
1
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I1
,
memorySize_h_compact1
));
case
1
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I1
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I1
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I1
,
memorySize_h_compact2
)
);
case
2
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I2
,
memorySize_h_compact1
));
case
2
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I2
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I2
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I2
,
memorySize_h_compact2
)
);
case
3
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I3
,
memorySize_h_compact1
));
case
3
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I3
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I3
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I3
,
memorySize_h_compact2
)
);
case
4
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I4
,
memorySize_h_compact1
));
case
4
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I4
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I4
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I4
,
memorySize_h_compact2
)
);
case
5
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I5
,
memorySize_h_compact1
));
case
5
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I5
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I5
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I5
,
memorySize_h_compact2
)
);
case
6
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I6
,
memorySize_h_compact1
));
case
6
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I6
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I6
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I6
,
memorySize_h_compact2
)
);
case
7
:
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I7
,
memorySize_h_compact1
));
case
7
:
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I7
,
memorySize_h_compact2
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
host_h_compact1_I7
,
memorySize_h_compact1
)
);
break
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
host_h_compact2_I7
,
memorySize_h_compact2
)
);
}
break
;
}
// return 0;
// return 0;
}
}
// Kernel 1
// Kernel 1
__global__
void
ldpc_cnp_kernel_1st_iter
(
/*char * dev_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
__global__
void
ldpc_cnp_kernel_1st_iter
(
/*char * dev_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
{
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
int
iMCW
=
blockIdx
.
y
;
// codeword id
int
iMCW
=
blockIdx
.
y
;
// codeword id
int
iBlkRow
=
blockIdx
.
x
;
// block row in h_base
int
iBlkRow
=
blockIdx
.
x
;
// block row in h_base
int
iBlkCol
;
// block col in h_base
int
iBlkCol
;
// block col in h_base
int
iSubRow
=
threadIdx
.
x
;
// row index in sub_block of h_base
int
iSubRow
=
threadIdx
.
x
;
// row index in sub_block of h_base
int
iCol
;
// overall col index in h_base
int
iCol
;
// overall col index in h_base
int
offsetR
;
int
offsetR
;
int
shift_t
;
int
shift_t
;
// For 2-min algorithm.
// For 2-min algorithm.
int
Q_sign
=
0
;
int
Q_sign
=
0
;
int
sq
;
int
sq
;
int
Q
,
Q_abs
;
int
Q
,
Q_abs
;
int
R_temp
;
int
R_temp
;
int
sign
=
1
;
int
sign
=
1
;
int
rmin1
=
INT32_MAX
;
int
rmin1
=
INT32_MAX
;
int
rmin2
=
INT32_MAX
;
int
rmin2
=
INT32_MAX
;
char
idx_min
=
0
;
char
idx_min
=
0
;
h_element
h_element_t
;
h_element
h_element_t
;
int
s
=
(
BG
==
1
)
?
h_ele_row_bg1_count
[
iBlkRow
]
:
h_ele_row_bg2_count
[
iBlkRow
];
int
s
=
(
BG
==
1
)
?
h_ele_row_bg1_count
[
iBlkRow
]
:
h_ele_row_bg2_count
[
iBlkRow
];
offsetR
=
(
iMCW
*
row
*
col
*
Zc
)
+
iBlkRow
*
Zc
+
iSubRow
;
// row*col*Zc = size of dev_dt
offsetR
=
(
iMCW
*
row
*
col
*
Zc
)
+
iBlkRow
*
Zc
+
iSubRow
;
// row*col*Zc = size of dev_dt
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// The 1st recursion
// The 1st recursion
for
(
int
i
=
0
;
i
<
s
;
i
++
)
// loop through all the ZxZ sub-blocks in a row
for
(
int
i
=
0
;
i
<
s
;
i
++
)
// loop through all the ZxZ sub-blocks in a row
{
{
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
// compact_col == row
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
// compact_col == row
iBlkCol
=
h_element_t
.
y
;
iBlkCol
=
h_element_t
.
y
;
shift_t
=
h_element_t
.
value
;
shift_t
=
h_element_t
.
value
;
shift_t
=
(
iSubRow
+
shift_t
)
%
Zc
;
shift_t
=
(
iSubRow
+
shift_t
)
%
Zc
;
iCol
=
(
iMCW
*
col
*
Zc
)
+
iBlkCol
*
Zc
+
shift_t
;
// col*Zc = size of llr
iCol
=
(
iMCW
*
col
*
Zc
)
+
iBlkCol
*
Zc
+
shift_t
;
// col*Zc = size of llr
Q
=
dev_llr
[
iCol
];
Q
=
dev_llr
[
iCol
];
Q_abs
=
(
Q
>
0
)
?
Q
:
-
Q
;
Q_abs
=
(
Q
>
0
)
?
Q
:
-
Q
;
sq
=
Q
<
0
;
sq
=
Q
<
0
;
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
// quick version
// quick version
sign
=
sign
*
(
1
-
sq
*
2
);
sign
=
sign
*
(
1
-
sq
*
2
);
Q_sign
|=
sq
<<
i
;
Q_sign
|=
sq
<<
i
;
if
(
Q_abs
<
rmin1
)
{
if
(
Q_abs
<
rmin1
)
{
rmin2
=
rmin1
;
rmin2
=
rmin1
;
rmin1
=
Q_abs
;
rmin1
=
Q_abs
;
idx_min
=
i
;
idx_min
=
i
;
}
else
if
(
Q_abs
<
rmin2
)
{
}
else
if
(
Q_abs
<
rmin2
)
{
rmin2
=
Q_abs
;
rmin2
=
Q_abs
;
}
}
}
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// The 2nd recursion
// The 2nd recursion
for
(
int
i
=
0
;
i
<
s
;
i
++
)
{
for
(
int
i
=
0
;
i
<
s
;
i
++
)
{
// v0: Best performance so far. 0.75f is the value of alpha.
// v0: Best performance so far. 0.75f is the value of alpha.
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
// write results to global memory
// write results to global memory
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
int
addr_temp
=
offsetR
+
h_element_t
.
y
*
row
*
Zc
;
int
addr_temp
=
offsetR
+
h_element_t
.
y
*
row
*
Zc
;
dev_dt
[
addr_temp
]
=
R_temp
;
dev_dt
[
addr_temp
]
=
R_temp
;
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
}
}
}
}
// Kernel_1
// Kernel_1
__global__
void
ldpc_cnp_kernel
(
/*char * dev_llr, char * dev_dt,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
__global__
void
ldpc_cnp_kernel
(
/*char * dev_llr, char * dev_dt,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
{
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
int
iMCW
=
blockIdx
.
y
;
int
iMCW
=
blockIdx
.
y
;
int
iBlkRow
=
blockIdx
.
x
;
// block row in h_base
int
iBlkRow
=
blockIdx
.
x
;
// block row in h_base
int
iBlkCol
;
// block col in h_base
int
iBlkCol
;
// block col in h_base
int
iSubRow
=
threadIdx
.
x
;
// row index in sub_block of h_base
int
iSubRow
=
threadIdx
.
x
;
// row index in sub_block of h_base
int
iCol
;
// overall col index in h_base
int
iCol
;
// overall col index in h_base
int
offsetR
;
int
offsetR
;
int
shift_t
;
int
shift_t
;
// For 2-min algorithm.
// For 2-min algorithm.
int
Q_sign
=
0
;
int
Q_sign
=
0
;
int
sq
;
int
sq
;
int
Q
,
Q_abs
;
int
Q
,
Q_abs
;
int
R_temp
;
int
R_temp
;
int
sign
=
1
;
int
sign
=
1
;
int
rmin1
=
INT32_MAX
;
int
rmin1
=
INT32_MAX
;
int
rmin2
=
INT32_MAX
;
int
rmin2
=
INT32_MAX
;
char
idx_min
=
0
;
char
idx_min
=
0
;
h_element
h_element_t
;
h_element
h_element_t
;
int
s
=
(
BG
==
1
)
?
h_ele_row_bg1_count
[
iBlkRow
]
:
h_ele_row_bg2_count
[
iBlkRow
];
int
s
=
(
BG
==
1
)
?
h_ele_row_bg1_count
[
iBlkRow
]
:
h_ele_row_bg2_count
[
iBlkRow
];
offsetR
=
(
iMCW
*
row
*
col
*
Zc
)
+
iBlkRow
*
Zc
+
iSubRow
;
// row * col * Zc = size of dev_dt
offsetR
=
(
iMCW
*
row
*
col
*
Zc
)
+
iBlkRow
*
Zc
+
iSubRow
;
// row * col * Zc = size of dev_dt
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// The 1st recursion
// The 1st recursion
for
(
int
i
=
0
;
i
<
s
;
i
++
)
// loop through all the ZxZ sub-blocks in a row
for
(
int
i
=
0
;
i
<
s
;
i
++
)
// loop through all the ZxZ sub-blocks in a row
{
{
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
iBlkCol
=
h_element_t
.
y
;
iBlkCol
=
h_element_t
.
y
;
shift_t
=
h_element_t
.
value
;
shift_t
=
h_element_t
.
value
;
shift_t
=
(
iSubRow
+
shift_t
)
%
Zc
;
shift_t
=
(
iSubRow
+
shift_t
)
%
Zc
;
iCol
=
iBlkCol
*
Zc
+
shift_t
;
iCol
=
iBlkCol
*
Zc
+
shift_t
;
R_temp
=
dev_dt
[
offsetR
+
iBlkCol
*
row
*
Zc
];
R_temp
=
dev_dt
[
offsetR
+
iBlkCol
*
row
*
Zc
];
Q
=
dev_llr
[
iMCW
*
(
col
*
Zc
)
+
iCol
]
-
R_temp
;
Q
=
dev_llr
[
iMCW
*
(
col
*
Zc
)
+
iCol
]
-
R_temp
;
Q_abs
=
(
Q
>
0
)
?
Q
:
-
Q
;
Q_abs
=
(
Q
>
0
)
?
Q
:
-
Q
;
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
sq
=
Q
<
0
;
sq
=
Q
<
0
;
sign
=
sign
*
(
1
-
sq
*
2
);
sign
=
sign
*
(
1
-
sq
*
2
);
Q_sign
|=
sq
<<
i
;
Q_sign
|=
sq
<<
i
;
if
(
Q_abs
<
rmin1
){
if
(
Q_abs
<
rmin1
)
{
rmin2
=
rmin1
;
rmin2
=
rmin1
;
rmin1
=
Q_abs
;
rmin1
=
Q_abs
;
idx_min
=
i
;
idx_min
=
i
;
}
else
if
(
Q_abs
<
rmin2
){
}
else
if
(
Q_abs
<
rmin2
)
{
rmin2
=
Q_abs
;
rmin2
=
Q_abs
;
}
}
}
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// The 2nd recursion
// The 2nd recursion
for
(
int
i
=
0
;
i
<
s
;
i
++
)
{
for
(
int
i
=
0
;
i
<
s
;
i
++
){
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
// write results to global memory
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
// write results to global memory
int
addr_temp
=
h_element_t
.
y
*
row
*
Zc
+
offsetR
;
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
dev_dt
[
addr_temp
]
=
R_temp
;
int
addr_temp
=
h_element_t
.
y
*
row
*
Zc
+
offsetR
;
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
dev_dt
[
addr_temp
]
=
R_temp
;
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
}
}
}
// Kernel 2: VNP processing
// Kernel 2: VNP processing
__global__
void
__global__
void
ldpc_vnp_kernel_normal
(
/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
ldpc_vnp_kernel_normal
(
/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
{
{
int
iMCW
=
blockIdx
.
y
;
int
iMCW
=
blockIdx
.
y
;
int
iBlkCol
=
blockIdx
.
x
;
int
iBlkCol
=
blockIdx
.
x
;
int
iBlkRow
;
int
iBlkRow
;
int
iSubCol
=
threadIdx
.
x
;
int
iSubCol
=
threadIdx
.
x
;
int
iRow
;
int
iRow
;
int
iCol
;
int
iCol
;
int
shift_t
,
sf
;
int
shift_t
,
sf
;
int
APP
;
int
APP
;
h_element
h_element_t
;
h_element
h_element_t
;
// update all the llr values
// update all the llr values
iCol
=
iBlkCol
*
Zc
+
iSubCol
;
iCol
=
iBlkCol
*
Zc
+
iSubCol
;
APP
=
dev_const_llr
[
iMCW
*
col
*
Zc
+
iCol
];
APP
=
dev_const_llr
[
iMCW
*
col
*
Zc
+
iCol
];
int
offsetDt
=
iMCW
*
row
*
col
*
Zc
+
iBlkCol
*
row
*
Zc
;
int
offsetDt
=
iMCW
*
row
*
col
*
Zc
+
iBlkCol
*
row
*
Zc
;
int
s
=
(
BG
==
1
)
?
h_ele_col_bg1_count
[
iBlkCol
]
:
h_ele_col_bg2_count
[
iBlkCol
];
int
s
=
(
BG
==
1
)
?
h_ele_col_bg1_count
[
iBlkCol
]
:
h_ele_col_bg2_count
[
iBlkCol
];
for
(
int
i
=
0
;
i
<
s
;
i
++
)
for
(
int
i
=
0
;
i
<
s
;
i
++
)
{
{
h_element_t
=
dev_h_compact2
[
i
*
col
+
iBlkCol
];
h_element_t
=
dev_h_compact2
[
i
*
col
+
iBlkCol
];
shift_t
=
h_element_t
.
value
%
Zc
;
shift_t
=
h_element_t
.
value
%
Zc
;
iBlkRow
=
h_element_t
.
x
;
iBlkRow
=
h_element_t
.
x
;
sf
=
iSubCol
-
shift_t
;
sf
=
iSubCol
-
shift_t
;
sf
=
(
sf
+
Zc
)
%
Zc
;
sf
=
(
sf
+
Zc
)
%
Zc
;
iRow
=
iBlkRow
*
Zc
+
sf
;
iRow
=
iBlkRow
*
Zc
+
sf
;
APP
=
APP
+
dev_dt
[
offsetDt
+
iRow
];
APP
=
APP
+
dev_dt
[
offsetDt
+
iRow
];
}
}
if
(
APP
>
SCHAR_MAX
)
if
(
APP
>
SCHAR_MAX
)
APP
=
SCHAR_MAX
;
APP
=
SCHAR_MAX
;
if
(
APP
<
SCHAR_MIN
)
APP
=
SCHAR_MIN
;
if
(
APP
<
SCHAR_MIN
)
// write back to device global memory
APP
=
SCHAR_MIN
;
dev_llr
[
iMCW
*
col
*
Zc
+
iCol
]
=
APP
;
// write back to device global memory
dev_llr
[
iMCW
*
col
*
Zc
+
iCol
]
=
APP
;
}
}
__global__
void
pack_decoded_bit
(
/*char *dev, unsigned char *host,*/
int
col
,
int
Zc
)
__global__
void
pack_decoded_bit
(
/*char *dev, unsigned char *host,*/
int
col
,
int
Zc
)
{
{
__shared__
unsigned
char
tmp
[
128
];
__shared__
unsigned
char
tmp
[
128
];
int
iMCW
=
blockIdx
.
y
;
int
iMCW
=
blockIdx
.
y
;
int
tid
=
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
128
+
threadIdx
.
x
;
int
tid
=
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
128
+
threadIdx
.
x
;
int
btid
=
threadIdx
.
x
;
int
btid
=
threadIdx
.
x
;
tmp
[
btid
]
=
0
;
tmp
[
btid
]
=
0
;
if
(
dev_llr
[
tid
]
<
0
)
{
if
(
dev_llr
[
tid
]
<
0
)
{
tmp
[
btid
]
=
1
<<
(
7
-
(
btid
&
7
));
tmp
[
btid
]
=
1
<<
(
7
-
(
btid
&
7
));
}
}
__syncthreads
();
__syncthreads
();
if
(
threadIdx
.
x
<
16
)
{
if
(
threadIdx
.
x
<
16
)
{
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
=
0
;
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
=
0
;
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
+=
tmp
[
threadIdx
.
x
*
8
+
i
];
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
+=
tmp
[
threadIdx
.
x
*
8
+
i
];
}
}
}
}
}
}
void
read_BG
(
int
BG
,
int
*
h
,
int
row
,
int
col
)
void
read_BG
(
int
BG
,
int
*
h
,
int
row
,
int
col
)
{
{
int
compact_row
=
30
,
compact_col
=
19
;
int
compact_row
=
30
,
compact_col
=
19
;
if
(
BG
==
2
){
compact_row
=
10
,
compact_col
=
23
;}
if
(
BG
==
2
)
{
compact_row
=
10
,
compact_col
=
23
;
h_element
h_element_temp
;
}
// init the compact matrix
h_element
h_element_temp
;
for
(
int
i
=
0
;
i
<
compact_col
;
i
++
){
for
(
int
j
=
0
;
j
<
row
;
j
++
){
// init the compact matrix
h_element_temp
.
x
=
0
;
for
(
int
i
=
0
;
i
<
compact_col
;
i
++
)
{
h_element_temp
.
y
=
0
;
for
(
int
j
=
0
;
j
<
row
;
j
++
)
{
h_element_temp
.
value
=
-
1
;
h_element_temp
.
x
=
0
;
h_compact1
[
i
*
row
+
j
]
=
h_element_temp
;
// h[i][0-11], the same column
h_element_temp
.
y
=
0
;
}
h_element_temp
.
value
=
-
1
;
h_compact1
[
i
*
row
+
j
]
=
h_element_temp
;
// h[i][0-11], the same column
}
}
// scan the h matrix, and gengerate compact mode of h
for
(
int
i
=
0
;
i
<
row
;
i
++
)
{
int
k
=
0
;
for
(
int
j
=
0
;
j
<
col
;
j
++
)
{
if
(
h
[
i
*
col
+
j
]
!=
-
1
)
{
h_element_temp
.
x
=
i
;
h_element_temp
.
y
=
j
;
h_element_temp
.
value
=
h
[
i
*
col
+
j
];
h_compact1
[
k
*
row
+
i
]
=
h_element_temp
;
k
++
;
}
}
}
// h_compact2
// init the compact matrix
for
(
int
i
=
0
;
i
<
compact_row
;
i
++
)
{
for
(
int
j
=
0
;
j
<
col
;
j
++
)
{
h_element_temp
.
x
=
0
;
h_element_temp
.
y
=
0
;
h_element_temp
.
value
=
-
1
;
h_compact2
[
i
*
col
+
j
]
=
h_element_temp
;
}
}
// scan the h matrix, and gengerate compact mode of h
}
for
(
int
i
=
0
;
i
<
row
;
i
++
){
int
k
=
0
;
for
(
int
j
=
0
;
j
<
col
;
j
++
)
{
for
(
int
j
=
0
;
j
<
col
;
j
++
){
int
k
=
0
;
if
(
h
[
i
*
col
+
j
]
!=
-
1
){
for
(
int
i
=
0
;
i
<
row
;
i
++
)
{
h_element_temp
.
x
=
i
;
if
(
h
[
i
*
col
+
j
]
!=
-
1
)
{
h_element_temp
.
y
=
j
;
// although h is transposed, the (x,y) is still (iBlkRow, iBlkCol)
h_element_temp
.
value
=
h
[
i
*
col
+
j
];
h_element_temp
.
x
=
i
;
h_compact1
[
k
*
row
+
i
]
=
h_element_temp
;
h_element_temp
.
y
=
j
;
k
++
;
h_element_temp
.
value
=
h
[
i
*
col
+
j
];
}
h_compact2
[
k
*
col
+
j
]
=
h_element_temp
;
}
k
++
;
}
}
}
}
// h_compact2
// init the compact matrix
/*
for
(
int
i
=
0
;
i
<
compact_row
;
i
++
){
for(int i = 0; i < compact_col; i++){
for
(
int
j
=
0
;
j
<
col
;
j
++
){
for(int j = 0; j < row; j++){
h_element_temp
.
x
=
0
;
printf("%3d, ", h_compact1[i*row+j].value);
h_element_temp
.
y
=
0
;
}
h_element_temp
.
value
=
-
1
;
printf("\n");
h_compact2
[
i
*
col
+
j
]
=
h_element_temp
;
}
}
}
for
(
int
j
=
0
;
j
<
col
;
j
++
){
for(int i = 0; i < compact_row; i++){
int
k
=
0
;
for(int j = 0; j < col; j++){
for
(
int
i
=
0
;
i
<
row
;
i
++
){
printf("%3d,", h_compact2[i*col+j].value);
if
(
h
[
i
*
col
+
j
]
!=
-
1
){
}
// although h is transposed, the (x,y) is still (iBlkRow, iBlkCol)
printf("\n");
h_element_temp
.
x
=
i
;
}
h_element_temp
.
y
=
j
;
*/
h_element_temp
.
value
=
h
[
i
*
col
+
j
];
h_compact2
[
k
*
col
+
j
]
=
h_element_temp
;
k
++
;
}
}
}
/*
for(int i = 0; i < compact_col; i++){
for(int j = 0; j < row; j++){
printf("%3d, ", h_compact1[i*row+j].value);
}
printf("\n");
}
for(int i = 0; i < compact_row; i++){
for(int j = 0; j < col; j++){
printf("%3d,", h_compact2[i*col+j].value);
}
printf("\n");
}
*/
}
}
extern
"C"
extern
"C"
void
init_LLR_DMA
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
){
void
init_LLR_DMA
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
){
uint16_t
Zc
=
p_decParams
->
Z
;
uint16_t
Zc
=
p_decParams
->
Z
;
uint8_t
BG
=
p_decParams
->
BG
;
uint8_t
BG
=
p_decParams
->
BG
;
uint8_t
col
;
int
block_length
=
p_decParams
->
block_length
;
if
(
BG
==
1
)
{
uint8_t
row
,
col
;
col
=
68
;
if
(
BG
==
1
){
}
else
{
row
=
46
;
col
=
52
;
col
=
68
;
}
}
int
memorySize_llr_cuda
=
col
*
Zc
*
sizeof
(
char
)
*
MC
;
else
{
cudaCheck
(
cudaMemcpyToSymbol
(
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
));
row
=
42
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_llr
,
p_llr
,
memorySize_llr_cuda
));
col
=
52
;
cudaDeviceSynchronize
();
}
unsigned
char
*
hard_decision
=
(
unsigned
char
*
)
p_out
;
int
memorySize_llr_cuda
=
col
*
Zc
*
sizeof
(
char
)
*
MC
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
)
);
cudaCheck
(
cudaMemcpyToSymbol
(
dev_llr
,
p_llr
,
memorySize_llr_cuda
)
);
cudaDeviceSynchronize
();
}
}
using
namespace
std
;
using
namespace
std
;
/* from here: entry points in decoder shared lib */
/* from here: entry points in decoder shared lib */
extern
"C"
extern
"C"
int
ldpc_autoinit
(
void
)
{
// called by the library loader
int
ldpc_autoinit
(
void
)
{
// called by the library loader
/*int devices = 0;
/*int devices = 0;
cudaError_t err = cudaGetDeviceCount(&devices);
cudaError_t err = cudaGetDeviceCount(&devices);
AssertFatal(devices>0,"\nNo cuda GPU found\n\n");
AssertFatal(devices>0,"\nNo cuda GPU found\n\n");
const int kb = 1024;
const int kb = 1024;
const int mb = kb * kb;
const int mb = kb * kb;
wcout << "NBody.GPU" << endl << "=========" << endl << endl;
wcout << "NBody.GPU" << endl << "=========" << endl << endl;
wcout << "CUDA version: v" << CUDART_VERSION << endl;
wcout << "CUDA version: v" << CUDART_VERSION << endl;
wcout << "CUDA Devices: " << endl << endl;
wcout << "CUDA Devices: " << endl << endl;
for(int i = 0; i < devices; ++i)
for(int i = 0; i < devices; ++i)
{
{
cudaDeviceProp props;
cudaDeviceProp props;
cudaGetDeviceProperties(&props, i);
cudaGetDeviceProperties(&props, i);
wcout << i << ": " << props.name << ": " << props.major << "." << props.minor << endl;
wcout << i << ": " << props.name << ": " << props.major << "." << props.minor << endl;
wcout << " Global memory: " << props.totalGlobalMem / mb << "mb" << endl;
wcout << " Global memory: " << props.totalGlobalMem / mb << "mb" << endl;
wcout << " Shared memory: " << props.sharedMemPerBlock / kb << "kb" << endl;
wcout << " Shared memory: " << props.sharedMemPerBlock / kb << "kb" << endl;
wcout << " Constant memory: " << props.totalConstMem / kb << "kb" << endl;
wcout << " Constant memory: " << props.totalConstMem / kb << "kb" << endl;
wcout << " Block registers: " << props.regsPerBlock << endl << endl;
wcout << " Block registers: " << props.regsPerBlock << endl << endl;
wcout << " Warp size: " << props.warpSize << endl;
wcout << " Warp size: " << props.warpSize << endl;
wcout << " Threads per block: " << props.maxThreadsPerBlock << endl;
wcout << " Threads per block: " << props.maxThreadsPerBlock << endl;
wcout << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << endl;
wcout << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1] << ", " <<
wcout << " Max grid dimensions: [ " << props.maxGridSize[0] << ", " << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << endl;
props.maxThreadsDim[2] << " ]" << endl; wcout << " Max grid dimensions: [ " << props.maxGridSize[0] << ", " <<
wcout << endl;
props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << endl;
wcout << endl;
}
}
*/
*/
warmup_for_GPU
();
warmup_for_GPU
();
return
0
;
return
0
;
}
}
extern
"C"
void
LDPCinit
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
)
extern
"C"
int32_t
LDPCinit
(
)
{
{
set_compact_BG
(
p_decParams
->
Z
,
p_decParams
->
BG
);
return
0
;
init_LLR_DMA
(
p_decParams
,
p_llr
,
p_out
);
}
}
extern
"C"
void
LDPCshutdown
()
extern
"C"
void
LDPCshutdown
()
{
{
}
}
extern
"C"
int32_t
LDPCdecoder
(
t_nrLDPC_dec_params
*
p_decParams
,
extern
"C"
int32_t
LDPCdecoder
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
uint8_t
harq_pid
,
int8_t
*
p_out
,
uint8_t
ulsch_id
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint8_t
C
,
t_nrLDPC_time_stats
*
time_decoder
)
int8_t
*
p_llr
,
int8_t
*
p_out
,
t_nrLDPC_time_stats
*
,
decode_abort_t
*
ab
)
{
{
uint16_t
Zc
=
p_decParams
->
Z
;
set_compact_BG
(
p_decParams
->
Z
,
p_decParams
->
BG
);
uint8_t
BG
=
p_decParams
->
BG
;
init_LLR_DMA
(
p_decParams
,
p_llr
,
p_out
);
uint8_t
numMaxIter
=
p_decParams
->
numMaxIter
;
uint16_t
Zc
=
p_decParams
->
Z
;
int
block_length
=
p_decParams
->
block_length
;
uint8_t
BG
=
p_decParams
->
BG
;
e_nrLDPC_outMode
outMode
=
p_decParams
->
outMode
;
int
block_length
=
p_decParams
->
block_length
;
cudaError_t
cudaStatus
;
uint8_t
row
,
col
;
uint8_t
row
,
col
;
if
(
BG
==
1
)
{
if
(
BG
==
1
){
row
=
46
;
row
=
46
;
col
=
68
;
col
=
68
;
}
else
{
}
row
=
42
;
else
{
col
=
52
;
row
=
42
;
}
col
=
52
;
}
// alloc memory
unsigned
char
*
hard_decision
=
(
unsigned
char
*
)
p_out
;
// alloc memory
// gpu
unsigned
char
*
hard_decision
=
(
unsigned
char
*
)
p_out
;
int
memorySize_llr_cuda
=
col
*
Zc
*
sizeof
(
char
)
*
MC
;
// gpu
cudaCheck
(
cudaMemcpyToSymbol
(
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
));
int
memorySize_llr_cuda
=
col
*
Zc
*
sizeof
(
char
)
*
MC
;
cudaCheck
(
cudaMemcpyToSymbol
(
dev_llr
,
p_llr
,
memorySize_llr_cuda
));
cudaCheck
(
cudaMemcpyToSymbol
(
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
)
);
cudaCheck
(
cudaMemcpyToSymbol
(
dev_llr
,
p_llr
,
memorySize_llr_cuda
)
);
// Define CUDA kernel dimension
int
blockSizeX
=
Zc
;
// Define CUDA kernel dimension
dim3
dimGridKernel1
(
row
,
MC
,
1
);
// dim of the thread blocks
int
blockSizeX
=
Zc
;
dim3
dimBlockKernel1
(
blockSizeX
,
1
,
1
);
dim3
dimGridKernel1
(
row
,
MC
,
1
);
// dim of the thread blocks
dim3
dimBlockKernel1
(
blockSizeX
,
1
,
1
);
dim3
dimGridKernel2
(
col
,
MC
,
1
);
dim3
dimBlockKernel2
(
blockSizeX
,
1
,
1
);
dim3
dimGridKernel2
(
col
,
MC
,
1
);
cudaDeviceSynchronize
();
dim3
dimBlockKernel2
(
blockSizeX
,
1
,
1
);
cudaDeviceSynchronize
();
// lauch kernel
// lauch kernel
for
(
int
ii
=
0
;
ii
<
MAX_ITERATION
;
ii
++
)
{
// first kernel
for
(
int
ii
=
0
;
ii
<
MAX_ITERATION
;
ii
++
){
if
(
ii
==
0
)
{
// first kernel
ldpc_cnp_kernel_1st_iter
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
(
/*dev_llr,*/
BG
,
row
,
col
,
Zc
);
if
(
ii
==
0
){
}
else
{
ldpc_cnp_kernel_1st_iter
ldpc_cnp_kernel
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
(
/*dev_llr,*/
BG
,
row
,
col
,
Zc
);
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
}
(
/*dev_llr,*/
BG
,
row
,
col
,
Zc
);
// second kernel
}
else
{
ldpc_vnp_kernel_normal
<<<
dimGridKernel2
,
dimBlockKernel2
>>>
ldpc_cnp_kernel
// (dev_llr, dev_const_llr,BG, row, col, Zc);
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
(
BG
,
row
,
col
,
Zc
);
(
/*dev_llr,*/
BG
,
row
,
col
,
Zc
);
}
}
// second kernel
int
pack
=
(
block_length
/
128
)
+
1
;
ldpc_vnp_kernel_normal
dim3
pack_block
(
pack
,
MC
,
1
);
<<<
dimGridKernel2
,
dimBlockKernel2
>>>
pack_decoded_bit
<<<
pack_block
,
128
>>>
(
/*dev_llr,*/
/*dev_tmp,*/
col
,
Zc
);
// (dev_llr, dev_const_llr,BG, row, col, Zc);
(
BG
,
row
,
col
,
Zc
);
cudaCheck
(
cudaMemcpyFromSymbol
((
void
*
)
hard_decision
,
(
const
void
*
)
dev_tmp
,
(
block_length
/
8
)
*
sizeof
(
unsigned
char
)));
}
cudaDeviceSynchronize
();
int
pack
=
(
block_length
/
128
)
+
1
;
return
MAX_ITERATION
;
dim3
pack_block
(
pack
,
MC
,
1
);
pack_decoded_bit
<<<
pack_block
,
128
>>>
(
/*dev_llr,*/
/*dev_tmp,*/
col
,
Zc
);
cudaCheck
(
cudaMemcpyFromSymbol
((
void
*
)
hard_decision
,
(
const
void
*
)
dev_tmp
,
(
block_length
/
8
)
*
sizeof
(
unsigned
char
))
);
cudaDeviceSynchronize
();
return
MAX_ITERATION
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment