Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lizhongxiao
OpenXG-RAN
Commits
96023eeb
Commit
96023eeb
authored
Aug 06, 2020
by
NCTU OpinConnect Terng-Yin Hsu/WEI-YING,LIN
Browse files
Options
Browse Files
Download
Plain Diff
Fix CUDA latency
parents
61bdea75
f7dc17ec
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
221 additions
and
0 deletions
+221
-0
openair1/PHY/CODING/TESTBENCH/ldpctest.c
openair1/PHY/CODING/TESTBENCH/ldpctest.c
+8
-0
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+210
-0
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
+3
-0
No files found.
openair1/PHY/CODING/TESTBENCH/ldpctest.c
View file @
96023eeb
...
...
@@ -402,9 +402,17 @@ int test_ldpc(short No_iteration,
start_meas
(
time_decoder
);
#ifdef CUDA_FLAG
if
(
run_cuda
){
<<<<<<<
HEAD
n_iter
=
nrLDPC_decoder_LYC
(
&
decParams
,
(
int8_t
*
)
channel_output_fixed
[
j
],
(
int8_t
*
)
estimated_output
[
j
],
block_length
,
time_decoder
);
}
else
{
=======
printf
(
"***********run ldpc by cuda
\n
"
);
n_iter
=
nrLDPC_decoder_LYC
(
&
decParams
,
(
int8_t
*
)
channel_output_fixed
[
j
],
(
int8_t
*
)
estimated_output
[
j
],
block_length
,
time_decoder
);
}
else
{
printf
(
"**************run ldpc by cpu
\n
"
);
>>>>>>>
origin
/
develop
// decode the sequence
// decoder supports BG2, Z=128 & 256
//esimated_output=ldpc_decoder(channel_output_fixed, block_length, No_iteration, (double)((float)nom_rate/(float)denom_rate));
...
...
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
View file @
96023eeb
...
...
@@ -32,7 +32,11 @@
#include "bgs/BG2_I6"
#include "bgs/BG2_I7"
<<<<<<<
HEAD
#define MAX_ITERATION 2
=======
#define MAX_ITERATION 5
>>>>>>>
origin
/
develop
#define MC 1
#define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
...
...
@@ -49,21 +53,30 @@ typedef struct{
char
y
;
short
value
;
}
h_element
;
<<<<<<<
HEAD
#include "bgs/BG1_compact_in_C.h"
__device__
char
dev_const_llr
[
68
*
384
];
__device__
char
dev_dt
[
46
*
68
*
384
];
__device__
char
dev_llr
[
68
*
384
];
__device__
unsigned
char
dev_tmp
[
68
*
384
];
=======
>>>>>>>
origin
/
develop
h_element
h_compact1
[
46
*
19
]
=
{};
h_element
h_compact2
[
68
*
30
]
=
{};
<<<<<<<
HEAD
__device__
h_element
dev_h_compact1
[
46
*
19
];
// used in kernel 1
__device__
h_element
dev_h_compact2
[
68
*
30
];
// used in kernel 2
// __device__ __constant__ h_element dev_h_compact1[46*19]; // used in kernel 1
// __device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2
=======
__device__
__constant__
h_element
dev_h_compact1
[
46
*
19
];
// used in kernel 1
__device__
__constant__
h_element
dev_h_compact2
[
68
*
30
];
// used in kernel 2
>>>>>>>
origin
/
develop
// row and col element count
__device__
__constant__
char
h_ele_row_bg1_count
[
46
]
=
{
...
...
@@ -100,6 +113,7 @@ __global__ void warmup()
// warm up gpu for time measurement
}
<<<<<<<
HEAD
extern
"C"
void
warmup_for_GPU
(){
...
...
@@ -187,6 +201,11 @@ void set_compact_BG(int Zc,short BG){
// Kernel 1
__global__
void
ldpc_cnp_kernel_1st_iter
(
/*char * dev_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
=======
// Kernel 1
__global__
void
ldpc_cnp_kernel_1st_iter
(
char
*
dev_llr
,
char
*
dev_dt
,
int
BG
,
int
row
,
int
col
,
int
Zc
)
>>>>>>>
origin
/
develop
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x)
;
int
iMCW
=
blockIdx
.
y
;
// codeword id
...
...
@@ -245,7 +264,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
for
(
int
i
=
0
;
i
<
s
;
i
++
){
// v0: Best performance so far. 0.75f is the value of alpha.
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
<<<<<<<
HEAD
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
=======
R_temp
=
0.8
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
>>>>>>>
origin
/
develop
// write results to global memory
h_element_t
=
dev_h_compact1
[
i
*
row
+
iBlkRow
];
int
addr_temp
=
offsetR
+
h_element_t
.
y
*
row
*
Zc
;
...
...
@@ -255,7 +278,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
}
// Kernel_1
<<<<<<<
HEAD
__global__
void
ldpc_cnp_kernel
(
/*char * dev_llr, char * dev_dt,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
=======
__global__
void
ldpc_cnp_kernel
(
char
*
dev_llr
,
char
*
dev_dt
,
int
BG
,
int
row
,
int
col
,
int
Zc
)
>>>>>>>
origin
/
develop
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n")
;
int
iMCW
=
blockIdx
.
y
;
...
...
@@ -315,7 +342,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r
// The 2nd recursion
for
(
int
i
=
0
;
i
<
s
;
i
++
){
sq
=
1
-
2
*
((
Q_sign
>>
i
)
&
0x01
);
<<<<<<<
HEAD
R_temp
=
0.75
f
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
=======
R_temp
=
0.8
*
sign
*
sq
*
(
i
!=
idx_min
?
rmin1
:
rmin2
);
>>>>>>>
origin
/
develop
// write results to global memory
...
...
@@ -328,7 +359,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r
// Kernel 2: VNP processing
__global__
void
<<<<<<<
HEAD
ldpc_vnp_kernel_normal
(
/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
int
BG
,
int
row
,
int
col
,
int
Zc
)
=======
ldpc_vnp_kernel_normal
(
char
*
dev_llr
,
char
*
dev_dt
,
char
*
dev_const_llr
,
int
BG
,
int
row
,
int
col
,
int
Zc
)
>>>>>>>
origin
/
develop
{
int
iMCW
=
blockIdx
.
y
;
int
iBlkCol
=
blockIdx
.
x
;
...
...
@@ -368,7 +403,11 @@ ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
}
<<<<<<<
HEAD
__global__
void
pack_decoded_bit
(
/*char *dev, unsigned char *host,*/
int
col
,
int
Zc
)
=======
__global__
void
pack_decoded_bit
(
char
*
dev
,
unsigned
char
*
host
,
int
col
,
int
Zc
)
>>>>>>>
origin
/
develop
{
__shared__
unsigned
char
tmp
[
128
];
int
iMCW
=
blockIdx
.
y
;
...
...
@@ -376,15 +415,25 @@ __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, in
int
btid
=
threadIdx
.
x
;
tmp
[
btid
]
=
0
;
<<<<<<<
HEAD
if
(
dev_llr
[
tid
]
<
0
){
=======
if
(
dev
[
tid
]
<
0
){
>>>>>>>
origin
/
develop
tmp
[
btid
]
=
1
<<
(
7
-
(
btid
&
7
));
}
__syncthreads
();
if
(
threadIdx
.
x
<
16
){
<<<<<<<
HEAD
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
=
0
;
for
(
int
i
=
0
;
i
<
8
;
i
++
){
dev_tmp
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
+=
tmp
[
threadIdx
.
x
*
8
+
i
];
=======
host
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
=
0
;
for
(
int
i
=
0
;
i
<
8
;
i
++
){
host
[
iMCW
*
col
*
Zc
+
blockIdx
.
x
*
16
+
threadIdx
.
x
]
+=
tmp
[
threadIdx
.
x
*
8
+
i
];
>>>>>>>
origin
/
develop
}
}
}
...
...
@@ -461,6 +510,7 @@ void read_BG(int BG, int *h, int row, int col)
*/
}
<<<<<<<
HEAD
extern
"C"
void
init_LLR_DMA_for_CUDA
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
,
int
block_length
){
...
...
@@ -482,17 +532,28 @@ void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
cudaDeviceSynchronize
();
}
=======
>>>>>>>
origin
/
develop
extern
"C"
int32_t
nrLDPC_decoder_LYC
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
,
int
block_length
,
time_stats_t
*
time_decoder
)
{
<<<<<<<
HEAD
=======
// alloc mem
//unsigned char *decision = (unsigned char*)p_out;
>>>>>>>
origin
/
develop
uint16_t
Zc
=
p_decParams
->
Z
;
uint8_t
BG
=
p_decParams
->
BG
;
uint8_t
numMaxIter
=
p_decParams
->
numMaxIter
;
e_nrLDPC_outMode
outMode
=
p_decParams
->
outMode
;
<<<<<<<
HEAD
cudaError_t
cudaStatus
;
=======
>>>>>>>
origin
/
develop
uint8_t
row
,
col
;
if
(
BG
==
1
){
row
=
46
;
...
...
@@ -502,6 +563,7 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
row
=
42
;
col
=
52
;
}
<<<<<<<
HEAD
// alloc memory
unsigned
char
*
hard_decision
=
(
unsigned
char
*
)
p_out
;
...
...
@@ -510,6 +572,98 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
cudaCheck
(
cudaMemcpyToSymbol
(
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
)
);
cudaCheck
(
cudaMemcpyToSymbol
(
dev_llr
,
p_llr
,
memorySize_llr_cuda
)
);
=======
int
compact_row
=
30
,
compact_col
=
19
,
lift_index
=
0
;;
if
(
BG
==
2
){
compact_row
=
10
,
compact_col
=
23
;}
short
lift_set
[][
9
]
=
{
{
2
,
4
,
8
,
16
,
32
,
64
,
128
,
256
},
{
3
,
6
,
12
,
24
,
48
,
96
,
192
,
384
},
{
5
,
10
,
20
,
40
,
80
,
160
,
320
},
{
7
,
14
,
28
,
56
,
112
,
224
},
{
9
,
18
,
36
,
72
,
144
,
288
},
{
11
,
22
,
44
,
88
,
176
,
352
},
{
13
,
26
,
52
,
104
,
208
},
{
15
,
30
,
60
,
120
,
240
},
{
0
}
};
for
(
int
i
=
0
;
lift_set
[
i
][
0
]
!=
0
;
i
++
){
for
(
int
j
=
0
;
lift_set
[
i
][
j
]
!=
0
;
j
++
){
if
(
Zc
==
lift_set
[
i
][
j
]){
lift_index
=
i
;
break
;
}
}
}
int
*
h
=
NULL
;
switch
(
lift_index
){
case
0
:
h
=
(
BG
==
1
)
?
h_base_0
:
h_base_8
;
break
;
case
1
:
h
=
(
BG
==
1
)
?
h_base_1
:
h_base_9
;
break
;
case
2
:
h
=
(
BG
==
1
)
?
h_base_2
:
h_base_10
;
break
;
case
3
:
h
=
(
BG
==
1
)
?
h_base_3
:
h_base_11
;
break
;
case
4
:
h
=
(
BG
==
1
)
?
h_base_4
:
h_base_12
;
break
;
case
5
:
h
=
(
BG
==
1
)
?
h_base_5
:
h_base_13
;
break
;
case
6
:
h
=
(
BG
==
1
)
?
h_base_6
:
h_base_14
;
break
;
case
7
:
h
=
(
BG
==
1
)
?
h_base_7
:
h_base_15
;
break
;
}
/* pack BG in compact graph */
read_BG
(
BG
,
h
,
row
,
col
);
int
memorySize_h_compact1
=
row
*
compact_col
*
sizeof
(
h_element
);
int
memorySize_h_compact2
=
compact_row
*
col
*
sizeof
(
h_element
);
// cpu
int
memorySize_hard_decision
=
col
*
Zc
*
sizeof
(
unsigned
char
)
*
MC
;
// alloc memory
unsigned
char
*
hard_decision
=
(
unsigned
char
*
)
p_out
;
// gpu
int
memorySize_llr_cuda
=
col
*
Zc
*
sizeof
(
char
)
*
MC
;
int
memorySize_dt_cuda
=
row
*
Zc
*
col
*
sizeof
(
char
)
*
MC
;
// alloc memory
char
*
dev_llr
;
char
*
dev_dt
;
char
*
dev_const_llr
;
unsigned
char
*
dev_tmp
;
cudaCheck
(
cudaMalloc
((
void
**
)
&
dev_tmp
,
memorySize_hard_decision
)
);
cudaCheck
(
cudaMalloc
((
void
**
)
&
dev_llr
,
memorySize_llr_cuda
)
);
cudaCheck
(
cudaMalloc
((
void
**
)
&
dev_const_llr
,
memorySize_llr_cuda
)
);
cudaCheck
(
cudaMalloc
((
void
**
)
&
dev_dt
,
memorySize_dt_cuda
)
);
// memcpy host to device
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact1
,
h_compact1
,
memorySize_h_compact1
)
);
cudaCheck
(
cudaMemcpyToSymbol
(
dev_h_compact2
,
h_compact2
,
memorySize_h_compact2
)
);
cudaCheck
(
cudaMemcpy
((
void
*
)
dev_const_llr
,
p_llr
,
memorySize_llr_cuda
,
cudaMemcpyHostToDevice
)
);
start_meas
(
time_decoder
);
cudaCheck
(
cudaMemcpy
((
void
*
)
dev_llr
,
p_llr
,
memorySize_llr_cuda
,
cudaMemcpyHostToDevice
)
);
>>>>>>>
origin
/
develop
// Define CUDA kernel dimension
int
blockSizeX
=
Zc
;
dim3
dimGridKernel1
(
row
,
MC
,
1
);
// dim of the thread blocks
...
...
@@ -518,14 +672,33 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
dim3
dimGridKernel2
(
col
,
MC
,
1
);
dim3
dimBlockKernel2
(
blockSizeX
,
1
,
1
);
cudaDeviceSynchronize
();
<<<<<<<
HEAD
// lauch kernel
=======
cudaEvent_t
start
,
end
;
float
time
;
warmup
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
();
warmup
<<<
dimGridKernel2
,
dimBlockKernel2
>>>
();
cudaEventCreate
(
&
start
);
cudaEventCreate
(
&
end
);
cudaEventRecord
(
start
,
0
);
// cudaProfilerStart();
// lauch kernel
>>>>>>>
origin
/
develop
for
(
int
ii
=
0
;
ii
<
MAX_ITERATION
;
ii
++
){
// first kernel
if
(
ii
==
0
){
ldpc_cnp_kernel_1st_iter
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
<<<<<<<
HEAD
(
/*dev_llr,*/
BG
,
row
,
col
,
Zc
);
}
else
{
ldpc_cnp_kernel
...
...
@@ -547,6 +720,43 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
cudaDeviceSynchronize
();
=======
(
dev_llr
,
dev_dt
,
BG
,
row
,
col
,
Zc
);
}
else
{
ldpc_cnp_kernel
<<<
dimGridKernel1
,
dimBlockKernel1
>>>
(
dev_llr
,
dev_dt
,
BG
,
row
,
col
,
Zc
);
}
// second kernel
ldpc_vnp_kernel_normal
<<<
dimGridKernel2
,
dimBlockKernel2
>>>
(
dev_llr
,
dev_dt
,
dev_const_llr
,
BG
,
row
,
col
,
Zc
);
}
int
pack
=
(
block_length
/
128
)
+
1
;
dim3
pack_block
(
pack
,
MC
,
1
);
pack_decoded_bit
<<<
pack_block
,
128
>>>
(
dev_llr
,
dev_tmp
,
col
,
Zc
);
cudaEventRecord
(
end
,
0
);
cudaEventSynchronize
(
end
);
cudaEventElapsedTime
(
&
time
,
start
,
end
);
//cudaCheck( cudaMemcpy((*)hard_decision, (const void*)dev_tmp, memorySize_hard_decision, cudaMemcpyDeviceToHost) );
cudaCheck
(
cudaMemcpy
((
void
*
)
hard_decision
,
(
const
void
*
)
dev_tmp
,
(
block_length
/
8
)
*
sizeof
(
unsigned
char
),
cudaMemcpyDeviceToHost
)
);
cudaDeviceSynchronize
();
stop_meas
(
time_decoder
);
cudaCheck
(
cudaFree
(
dev_llr
)
);
cudaCheck
(
cudaFree
(
dev_dt
)
);
cudaCheck
(
cudaFree
(
dev_const_llr
)
);
cudaCheck
(
cudaFree
(
dev_tmp
)
);
//free(hard_decision);
>>>>>>>
origin
/
develop
return
MAX_ITERATION
;
}
openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
View file @
96023eeb
...
...
@@ -23,10 +23,13 @@
int32_t
nrLDPC_decoder_LYC
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
,
int
block_length
,
time_stats_t
*
time_decoder
);
<<<<<<<
HEAD
void
init_LLR_DMA_for_CUDA
(
t_nrLDPC_dec_params
*
p_decParams
,
int8_t
*
p_llr
,
int8_t
*
p_out
,
int
block_length
);
void
warmup_for_GPU
(
void
);
void
set_compact_BG
(
int
Zc
,
short
BG
);
=======
>>>>>>>
origin
/
develop
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment