Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
OpenXG
OpenXG-RAN
Commits
4345d42e
Commit
4345d42e
authored
Oct 05, 2021
by
frtabu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
opencl implementation of ldpc decoder (platform query)
parent
d31ac580
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
321 additions
and
22 deletions
+321
-22
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
+60
-22
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
...r1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
+261
-0
No files found.
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
View file @
4345d42e
...
@@ -39,8 +39,8 @@
...
@@ -39,8 +39,8 @@
#define MAX_ITERATION 2
#define MAX_ITERATION 2
#define MC 1
#define MC 1
#define MAX_OCLDEV 10
#define MAX_OCLDEV
10
#define MAX_OCLRUNTIME 5
typedef
struct
{
typedef
struct
{
char
x
;
char
x
;
char
y
;
char
y
;
...
@@ -51,14 +51,20 @@ typedef struct{
...
@@ -51,14 +51,20 @@ typedef struct{
cl_uint
max_CU
;
cl_uint
max_CU
;
cl_uint
max_WID
;
cl_uint
max_WID
;
size_t
*
max_WIS
;
size_t
*
max_WIS
;
}
ocldev_t
;
typedef
struct
{
cl_uint
num_devices
;
cl_device_id
devices
[
MAX_OCLDEV
];
ocldev_t
ocldev
[
MAX_OCLDEV
];
cl_context
context
;
cl_context
context
;
cl_program
program
;
cl_program
program
;
cl_kernel
kernel
;
cl_kernel
kernel
;
cl_command_queue
queue
;
cl_command_queue
queue
[
MAX_OCLDEV
]
;
}
ocl
dev
_t
;
}
ocl
runtime
_t
;
typedef
struct
{
typedef
struct
{
ocl
dev_t
ocldev
[
MAX_OCLDEV
];
ocl
runtime_t
runtime
[
MAX_OCLRUNTIME
];
}
ocl_t
;
}
ocl_t
;
...
@@ -86,7 +92,8 @@ void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out
...
@@ -86,7 +92,8 @@ void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
}
}
cl_error_callback
(
const
char
*
errinfo
,
const
void
*
private_info
,
size_t
cb
,
void
*
user_data
)
{
void
cl_error_callback
(
const
char
*
errinfo
,
const
void
*
private_info
,
size_t
cb
,
void
*
user_data
)
{
oclruntime_t
*
runtime
=
(
oclruntime_t
*
)
user_data
;
LOG_E
(
HW
,
"OpenCL accelerator error %s
\n
"
,
errinfo
);
LOG_E
(
HW
,
"OpenCL accelerator error %s
\n
"
,
errinfo
);
}
}
...
@@ -103,11 +110,25 @@ char *clutil_getstrdev(int intdev) {
...
@@ -103,11 +110,25 @@ char *clutil_getstrdev(int intdev) {
return
retstring
;
return
retstring
;
}
}
size_t
load_source
(
char
**
source_str
)
{
int
MAX_SOURCE_SIZE
=
(
500
*
132
);
FILE
*
fp
;
size_t
source_size
;
fp
=
fopen
(
"nrLDPC_decoder_kernels_CL.cl"
,
"r"
);
AssertFatal
(
fp
,
"failed to open cl source: %s
\n
"
,
strerror
(
errno
));
*
source_str
=
(
char
*
)
malloc
(
MAX_SOURCE_SIZE
);
source_size
=
fread
(
*
source_str
,
1
,
MAX_SOURCE_SIZE
,
fp
);
fclose
(
fp
);
return
source_size
;
}
/* from here: entry points in decoder shared lib */
/* from here: entry points in decoder shared lib */
int
ldpc_autoinit
(
void
)
{
// called by the library loader
int
ldpc_autoinit
(
void
)
{
// called by the library loader
cl_platform_id
platforms
[
10
];
cl_platform_id
platforms
[
10
];
cl_uint
num_platforms_found
;
cl_uint
num_platforms_found
;
int
context_ok
=
0
;
cl_uint
rt
=
clGetPlatformIDs
(
sizeof
(
platforms
)
/
sizeof
(
cl_platform_id
),
platforms
,
&
num_platforms_found
);
cl_uint
rt
=
clGetPlatformIDs
(
sizeof
(
platforms
)
/
sizeof
(
cl_platform_id
),
platforms
,
&
num_platforms_found
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetPlatformIDs error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetPlatformIDs error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
num_platforms_found
>
0
,
"clGetPlatformIDs: no cl compatible platform found
\n
"
);
AssertFatal
(
num_platforms_found
>
0
,
"clGetPlatformIDs: no cl compatible platform found
\n
"
);
...
@@ -119,33 +140,50 @@ int ldpc_autoinit(void) { // called by the library loader
...
@@ -119,33 +140,50 @@ int ldpc_autoinit(void) { // called by the library loader
rt
=
clGetPlatformInfo
(
platforms
[
i
],
CL_PLATFORM_VERSION
,
sizeof
(
stringval
),
stringval
,
NULL
);
rt
=
clGetPlatformInfo
(
platforms
[
i
],
CL_PLATFORM_VERSION
,
sizeof
(
stringval
),
stringval
,
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetPlatformInfo VERSION error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetPlatformInfo VERSION error %d
\n
"
,
(
int
)
rt
);
LOG_I
(
HW
,
"Platform %i, OpenCL version %s
\n
"
,
i
,
stringval
);
LOG_I
(
HW
,
"Platform %i, OpenCL version %s
\n
"
,
i
,
stringval
);
cl_device_id
devices
[
20
];
rt
=
clGetDeviceIDs
(
platforms
[
i
],
CL_DEVICE_TYPE_ALL
,
sizeof
(
ocl
.
runtime
[
i
].
devices
)
/
sizeof
(
cl_device_id
),
ocl
.
runtime
[
i
].
devices
,
&
(
ocl
.
runtime
[
i
].
num_devices
));
cl_uint
num_devices_found
;
rt
=
clGetDeviceIDs
(
platforms
[
i
],
CL_DEVICE_TYPE_ALL
,
sizeof
(
devices
)
/
sizeof
(
cl_device_id
),
devices
,
&
num_devices_found
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceIDs error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceIDs error %d
\n
"
,
(
int
)
rt
);
for
(
int
j
=
0
;
j
<
num_devices_found
;
j
++
)
{
int
devok
=
0
;
for
(
int
j
=
0
;
j
<
ocl
.
runtime
[
i
].
num_devices
;
j
++
)
{
cl_bool
abool
;
cl_bool
abool
;
rt
=
clGetDeviceInfo
(
devices
[
j
],
CL_DEVICE_AVAILABLE
,
sizeof
(
abool
),
&
abool
,
NULL
);
rt
=
clGetDeviceInfo
(
ocl
.
runtime
[
i
].
devices
[
j
],
CL_DEVICE_AVAILABLE
,
sizeof
(
abool
),
&
abool
,
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo DEVICE_AVAILABLE error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo DEVICE_AVAILABLE error %d
\n
"
,
(
int
)
rt
);
LOG_I
(
HW
,
"Device %i is %s available
\n
"
,
j
,
(
abool
==
CL_TRUE
?
""
:
"not"
));
LOG_I
(
HW
,
"Device %i is %s available
\n
"
,
j
,
(
abool
==
CL_TRUE
?
""
:
"not"
));
cl_device_type
devtype
;
cl_device_type
devtype
;
rt
=
clGetDeviceInfo
(
devices
[
j
],
CL_DEVICE_TYPE
,
sizeof
(
cl_device_type
),
&
devtype
,
NULL
);
rt
=
clGetDeviceInfo
(
ocl
.
runtime
[
i
].
devices
[
j
],
CL_DEVICE_TYPE
,
sizeof
(
cl_device_type
),
&
devtype
,
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo DEVICE_TYPE error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo DEVICE_TYPE error %d
\n
"
,
(
int
)
rt
);
LOG_I
(
HW
,
"Device %i, type %d = %s
\n
"
,
j
,(
int
)
devtype
,
clutil_getstrdev
(
devtype
));
LOG_I
(
HW
,
"Device %i, type %d = %s
\n
"
,
j
,(
int
)
devtype
,
clutil_getstrdev
(
devtype
));
rt
=
clGetDeviceInfo
(
devices
[
j
],
CL_DEVICE_MAX_COMPUTE_UNITS
,
sizeof
(
ocl
.
ocldev
[
j
].
max_CU
),
&
(
ocl
.
ocldev
[
j
].
max_CU
),
NULL
);
rt
=
clGetDeviceInfo
(
ocl
.
runtime
[
i
].
devices
[
j
],
CL_DEVICE_MAX_COMPUTE_UNITS
,
sizeof
(
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_CU
),
&
(
ocl
.
runtime
[
i
]
.
ocldev
[
j
].
max_CU
),
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_COMPUTE_UNITS error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_COMPUTE_UNITS error %d
\n
"
,
(
int
)
rt
);
LOG_I
(
HW
,
"Device %i, number of Compute Units: %d
\n
"
,
j
,
ocl
.
ocldev
[
j
].
max_CU
);
LOG_I
(
HW
,
"Device %i, number of Compute Units: %d
\n
"
,
j
,
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_CU
);
rt
=
clGetDeviceInfo
(
devices
[
j
],
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
,
sizeof
(
ocl
.
ocldev
[
j
].
max_WID
),
&
(
ocl
.
ocldev
[
j
].
max_WID
),
NULL
);
rt
=
clGetDeviceInfo
(
ocl
.
runtime
[
i
].
devices
[
j
],
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
,
sizeof
(
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WID
),
&
(
ocl
.
runtime
[
i
]
.
ocldev
[
j
].
max_WID
),
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_WORK_ITEM_DIMENSIONS error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_WORK_ITEM_DIMENSIONS error %d
\n
"
,
(
int
)
rt
);
LOG_I
(
HW
,
"Device %i, max Work Items dimension: %d
\n
"
,
j
,
ocl
.
ocldev
[
j
].
max_WID
);
LOG_I
(
HW
,
"Device %i, max Work Items dimension: %d
\n
"
,
j
,
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WID
);
ocl
.
ocldev
[
j
].
max_WIS
=
(
size_t
*
)
malloc
(
ocl
.
ocldev
[
j
].
max_WID
*
sizeof
(
size_t
));
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WIS
=
(
size_t
*
)
malloc
(
ocl
.
runtime
[
i
]
.
ocldev
[
j
].
max_WID
*
sizeof
(
size_t
));
rt
=
clGetDeviceInfo
(
devices
[
j
],
CL_DEVICE_MAX_WORK_ITEM_SIZES
,
sizeof
(
ocl
.
ocldev
[
j
].
max_WID
)
*
sizeof
(
size_t
),
ocl
.
ocldev
[
j
].
max_WIS
,
NULL
);
rt
=
clGetDeviceInfo
(
ocl
.
runtime
[
i
].
devices
[
j
],
CL_DEVICE_MAX_WORK_ITEM_SIZES
,
sizeof
(
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WID
)
*
sizeof
(
size_t
),
ocl
.
runtime
[
i
]
.
ocldev
[
j
].
max_WIS
,
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_WORK_ITEM_SIZES error %d
\n
"
,
(
int
)
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"clGetDeviceInfo MAX_WORK_ITEM_SIZES error %d
\n
"
,
(
int
)
rt
);
for
(
int
k
=
0
;
k
<
ocl
.
ocldev
[
j
].
max_WID
;
k
++
)
for
(
int
k
=
0
;
k
<
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WID
;
k
++
)
LOG_I
(
HW
,
"Device %i, max Work Items size for dimension: %d %u
\n
"
,
j
,
k
,(
uint32_t
)
ocl
.
ocldev
[
j
].
max_WIS
[
k
]);
LOG_I
(
HW
,
"Device %i, max Work Items size for dimension: %d %u
\n
"
,
j
,
k
,(
uint32_t
)
ocl
.
runtime
[
i
].
ocldev
[
j
].
max_WIS
[
k
]);
devok
++
;
}
if
(
devok
>
0
)
{
ocl
.
runtime
[
i
].
context
=
clCreateContext
(
NULL
,
ocl
.
runtime
[
i
].
num_devices
,
ocl
.
runtime
[
i
].
devices
,
cl_error_callback
,
&
(
ocl
.
runtime
[
i
]),
(
cl_int
*
)
&
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"Error %d creating context for platform %i
\n
"
,
(
int
)
rt
,
i
);
for
(
int
dev
=
0
;
dev
<
ocl
.
runtime
[
i
].
num_devices
;
dev
++
)
{
ocl
.
runtime
[
i
].
queue
[
dev
]
=
clCreateCommandQueueWithProperties
(
ocl
.
runtime
[
i
].
context
,
ocl
.
runtime
[
i
].
devices
[
dev
]
,
0
,
(
cl_int
*
)
&
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"Error %d creating command queue for platform %i device %i
\n
"
,
(
int
)
rt
,
i
,
dev
);
}
char
*
source_str
;
size_t
source_size
=
load_source
(
&
source_str
);
cl_program
program
=
clCreateProgramWithSource
(
ocl
.
runtime
[
i
].
context
,
1
,
(
const
char
**
)
&
source_str
,
(
const
size_t
*
)
&
source_size
,
(
cl_int
*
)
&
rt
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"Error %d creating program for platform %i
\n
"
,
(
int
)
rt
,
i
);
rt
=
clBuildProgram
(
program
,
ocl
.
runtime
[
i
].
num_devices
,
ocl
.
runtime
[
i
].
devices
,
NULL
,
NULL
,
NULL
);
AssertFatal
(
rt
==
CL_SUCCESS
,
"Error %d buildding program for platform %i
\n
"
,
rt
,
i
);
context_ok
++
;
}
}
devok
=
0
;
}
}
AssertFatal
(
context_ok
>
0
,
"No openCL device available to accelerate ldpc
\n
"
);
return
0
;
return
0
;
}
}
...
...
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
0 → 100644
View file @
4345d42e
define
MAX_ITERATION
2
#
define
MC
1
typedef
struct{
char
x
;
char
y
;
short
value
;
}
h_element
;
char
dev_const_llr[68*384]
;
char
dev_dt
[46*68*384]
;
char
dev_llr[68*384]
;
unsigned
char
dev_tmp[68*384]
;
h_element
h_compact1
[46*19]
=
{}
;
h_element
h_compact2
[68*30]
=
{}
;
h_element
dev_h_compact1[46*19]
; // used in kernel 1
h_element
dev_h_compact2[68*30]
; // used in kernel 2
//
__device__
__constant__
h_element
dev_h_compact1[46*19]
; // used in kernel 1
//
__device__
__constant__
h_element
dev_h_compact2[68*30]
; // used in kernel 2
//
row
and
col
element
count
const
char
h_ele_row_bg1_count[46]
=
{
19
,
19
,
19
,
19
,
3
,
8
,
9
,
7
,
10
,
9
,
7
,
8
,
7
,
6
,
7
,
7
,
6
,
6
,
6
,
6
,
6
,
6
,
5
,
5
,
6
,
5
,
5
,
4
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
4
,
5
,
5
,
4}
;
const
char
h_ele_col_bg1_count[68]
=
{
30
,
28
,
7
,
11
,
9
,
4
,
8
,
12
,
8
,
7
,
12
,
10
,
12
,
11
,
10
,
7
,
10
,
10
,
13
,
7
,
8
,
11
,
12
,
5
,
6
,
6
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1}
;
const
char
h_ele_row_bg2_count[42]
=
{
8
,
10
,
8
,
10
,
4
,
6
,
6
,
6
,
4
,
5
,
5
,
5
,
4
,
5
,
5
,
4
,
5
,
5
,
4
,
4
,
4
,
4
,
3
,
4
,
4
,
3
,
5
,
3
,
4
,
3
,
5
,
3
,
4
,
4
,
4
,
4
,
4
,
3
,
4
,
4
,
4
,
4}
;
const
char
h_ele_col_bg2_count[52]
=
{
22
,
23
,
10
,
5
,
5
,
14
,
7
,
13
,
6
,
8
,
9
,
16
,
9
,
12
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1}
;
//
Kernel
1
__kernel
void
ldpc_cnp_kernel_1st_iter
(
/*char
*
dev_llr,*/
int
BG,
int
row,
int
col,
int
Zc
)
{
//
if
(
blockIdx.x
==
0
&&
threadIdx.x
==
1
)
printf
(
"cnp %d\n"
,
threadIdx.x
)
;
int
iMCW
=
blockIdx.y
; // codeword id
int
iBlkRow
=
blockIdx.x
; // block row in h_base
int
iBlkCol
; // block col in h_base
int
iSubRow
=
threadIdx.x
; // row index in sub_block of h_base
int
iCol
; // overall col index in h_base
int
offsetR
;
int
shift_t
;
//
For
2-min
algorithm.
int
Q_sign
=
0
;
int
sq
;
int
Q,
Q_abs
;
int
R_temp
;
int
sign
=
1
;
int
rmin1
=
INT32_MAX
;
int
rmin2
=
INT32_MAX
;
char
idx_min
=
0
;
h_element
h_element_t
;
int
s
=
(
BG==1
)
?
h_ele_row_bg1_count[iBlkRow]:h_ele_row_bg2_count[iBlkRow]
;
offsetR
=
(
iMCW
*
row*col*Zc
)
+
iBlkRow
*
Zc
+
iSubRow
; // row*col*Zc = size of dev_dt
//
if
(
blockIdx.x
==
0
&&
threadIdx.x
==
1
)
printf
(
"s: %d, offset %d\n"
,
s,
offsetR
)
;
//
The
1st
recursion
for
(
int
i
=
0
; i < s; i++) // loop through all the ZxZ sub-blocks in a row
{
h_element_t
=
dev_h_compact1[i*row+iBlkRow]
; // compact_col == row
iBlkCol
=
h_element_t.y
;
shift_t
=
h_element_t.value
;
shift_t
=
(
iSubRow
+
shift_t
)
%
Zc
;
iCol
=
(
iMCW
*
col*Zc
)
+
iBlkCol
*
Zc
+
shift_t
; // col*Zc = size of llr
Q
=
dev_llr[iCol]
;
Q_abs
=
(
Q>0
)
?
Q
:
-Q
;
sq
=
Q
<
0
;
//
if
(
blockIdx.x
==
0
&&
threadIdx.x
==
1
)
printf
(
"i %d, icol %d, Q: %d\n"
,
i,
iCol,
Q
)
;
//
quick
version
sign
=
sign
*
(
1
-
sq
*
2
)
;
Q_sign
|= sq << i;
if (Q_abs < rmin1){
rmin2 = rmin1;
rmin1 = Q_abs;
idx_min = i;
} else if (Q_abs < rmin2){
rmin2 = Q_abs;
}
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// The 2nd recursion
for(int i = 0; i < s; i++){
// v0: Best performance so far. 0.75f is the value of alpha.
sq = 1 - 2 * ((Q_sign >> i) & 0x01);
R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
// write results to global memory
h_element_t = dev_h_compact1[i*row+iBlkRow];
int addr_temp = offsetR + h_element_t.y * row * Zc;
dev_dt[addr_temp] = R_temp;
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
}
}
// Kernel_1
__kernel void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
int iMCW = blockIdx.y;
int iBlkRow = blockIdx.x; // block row in h_base
int iBlkCol; // block col in h_base
int iSubRow = threadIdx.x; // row index in sub_block of h_base
int iCol; // overall col index in h_base
int offsetR;
int shift_t;
// For 2-min algorithm.
int Q_sign = 0;
int sq;
int Q, Q_abs;
int R_temp;
int sign = 1;
int rmin1 = INT32_MAX;
int rmin2 = INT32_MAX;
char idx_min = 0;
h_element h_element_t;
int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]: h_ele_row_bg2_count[iBlkRow];
offsetR = (iMCW *row*col*Zc) + iBlkRow * Zc + iSubRow; // row * col * Zc = size of dev_dt
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// The 1st recursion
for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
{
h_element_t = dev_h_compact1[i*row+iBlkRow];
iBlkCol = h_element_t.y;
shift_t = h_element_t.value;
shift_t = (iSubRow + shift_t) % Zc;
iCol = iBlkCol * Zc + shift_t;
R_temp = dev_dt[offsetR + iBlkCol * row * Zc];
Q = dev_llr[iMCW * (col*Zc) + iCol] - R_temp;
Q_abs = (Q>0)? Q : -Q;
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
sq = Q < 0;
sign = sign * (1 - sq * 2);
Q_sign |
=
sq
<<
i
;
if
(
Q_abs
<
rmin1
)
{
rmin2
=
rmin1
;
rmin1
=
Q_abs
;
idx_min
=
i
;
}
else
if
(
Q_abs
<
rmin2
)
{
rmin2
=
Q_abs
;
}
}
//
if
(
blockIdx.x
==
0
&&
threadIdx.x
==
1
)
printf
(
"min1 %d, min2 %d, min1_idx %d\n"
,
rmin1,
rmin2,
idx_min
)
;
//
The
2nd
recursion
for
(
int
i
=
0
; i < s; i ++){
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
)
;
R_temp
=
0.75f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
)
;
//
write
results
to
global
memory
h_element_t
=
dev_h_compact1[i*row+iBlkRow]
;
int
addr_temp
=
h_element_t.y
*
row
*
Zc
+
offsetR
;
dev_dt[addr_temp]
=
R_temp
;
//
if
(
blockIdx.x
==
0
&&
threadIdx.x
==
1
)
printf
(
"R_temp %d, temp_addr %d\n"
,
R_temp,
addr_temp
)
;
}
}
//
Kernel
2:
VNP
processing
__kernel
void
ldpc_vnp_kernel_normal
(
/*char
*
dev_llr,
char
*
dev_dt,
char
*
dev_const_llr,*/
int
BG,
int
row,
int
col,
int
Zc
)
{
int
iMCW
=
blockIdx.y
;
int
iBlkCol
=
blockIdx.x
;
int
iBlkRow
;
int
iSubCol
=
threadIdx.x
;
int
iRow
;
int
iCol
;
int
shift_t,
sf
;
int
APP
;
h_element
h_element_t
;
//
update
all
the
llr
values
iCol
=
iBlkCol
*
Zc
+
iSubCol
;
APP
=
dev_const_llr[iMCW
*col*
Zc
+
iCol]
;
int
offsetDt
=
iMCW
*row*col*
Zc
+
iBlkCol
*
row
*
Zc
;
int
s
=
(
BG==1
)
?
h_ele_col_bg1_count[iBlkCol]:h_ele_col_bg2_count[iBlkCol]
;
for
(
int
i
=
0
; i < s; i++)
{
h_element_t
=
dev_h_compact2[i*col+iBlkCol]
;
shift_t
=
h_element_t.value%Zc
;
iBlkRow
=
h_element_t.x
;
sf
=
iSubCol
-
shift_t
;
sf
=
(
sf
+
Zc
)
%
Zc
;
iRow
=
iBlkRow
*
Zc
+
sf
;
APP
=
APP
+
dev_dt[offsetDt
+
iRow]
;
}
if
(
APP
>
SCHAR_MAX
)
APP
=
SCHAR_MAX
;
if
(
APP
<
SCHAR_MIN
)
APP
=
SCHAR_MIN
;
//
write
back
to
device
global
memory
dev_llr[iMCW
*col*
Zc
+
iCol]
=
APP
;
}
__kernel
void
pack_decoded_bit
(
/*char
*dev,
unsigned
char
*host,*
/
int
col,
int
Zc
)
{
__shared__
unsigned
char
tmp[128]
;
int
iMCW
=
blockIdx.y
;
int
tid
=
iMCW
*
col*Zc
+
blockIdx.x*128
+
threadIdx.x
;
int
btid
=
threadIdx.x
;
tmp[btid]
=
0
;
if
(
dev_llr[tid]
<
0
)
{
tmp[btid]
=
1
<<
(
7-
(
btid&7
))
;
}
__syncthreads
()
;
if
(
threadIdx.x
<
16
)
{
dev_tmp[iMCW
*
col*Zc
+
blockIdx.x*16+threadIdx.x]
=
0
;
for
(
int
i
=
0
; i < 8; i++){
dev_tmp[iMCW
*
col*Zc
+
blockIdx.x*16+threadIdx.x]
+=
tmp[threadIdx.x*8+i]
;
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment