Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
wangjie
OpenXG-RAN
Commits
21c5e485
Commit
21c5e485
authored
Nov 21, 2019
by
tyhsu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
v3. decrease GMEM access in beamforming precode
parent
6a271962
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
77 additions
and
53 deletions
+77
-53
openair1/CUDA/CUDA_phy_procedure.cu
openair1/CUDA/CUDA_phy_procedure.cu
+58
-50
openair1/CUDA/cuda_struct.h
openair1/CUDA/cuda_struct.h
+2
-1
openair1/CUDA/init_cuda.cu
openair1/CUDA/init_cuda.cu
+11
-2
openair1/SCHED_NR/nr_ru_procedures.c
openair1/SCHED_NR/nr_ru_procedures.c
+6
-0
No files found.
openair1/CUDA/CUDA_phy_procedure.cu
View file @
21c5e485
...
...
@@ -116,7 +116,7 @@ extern "C" void CUDA_ifft_ofdm( int **output,
int
threadNum
=
1024
;
int
blockNum
=
fftsize
*
nb_symbols
*
nb_tx
/
threadNum
;
cu_intToComplex
<<<
blockNum
,
threadNum
>>>
(
d_txdataF_BF
,
d_signal
);
//
cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
//CHECK_STATE("cu_intToComplex");
cufftErrchk
(
cufftExecC2C
(
plan
,
(
cufftComplex
*
)
d_signal
,
(
cufftComplex
*
)
d_signal
,
CUFFT_INVERSE
));
...
...
@@ -144,70 +144,78 @@ extern "C" void CUDA_ifft_ofdm( int **output,
}
__device__
inline
void
beamComp
(
int
*
res
,
int
*
x1
,
int
*
x2
){
((
short
*
)
res
)[
0
]
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
0
]
+
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
1
];
((
short
*
)
res
)[
1
]
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
1
]
-
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
0
];
((
short
*
)
res
)[
0
]
+
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
0
]
+
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
1
];
((
short
*
)
res
)[
1
]
+
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
1
]
-
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
0
];
}
__global__
void
conjMulAll
(
int
*
txdataF
,
int
*
weight
,
int
*
res
,
extern
__constant__
int
PORTSIZE
;
extern
__constant__
int
SUBTXSIZE
;
extern
__constant__
int
BW_PSIZE
;
__global__
void
conjMulAll
(
int
*
txdataF
,
int
*
weight
,
int
*
sub
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
__shared__
int
x1
[
2048
*
5
];
int
symbSart
=
blockIdx
.
x
*
5
;
int
portId
=
blockIdx
.
y
;
__shared__
int
x1
[
2048
*
4
];
__shared__
int
res
[
2048
]
;
int
id
=
threadIdx
.
x
;
int
id2
=
id
+
1024
;
int
aaSize
=
nb_antenna_ports
*
nb_symbols
*
fftsize
;
int
portSize
=
nb_symbols
*
fftsize
;
int
symbId
=
blockIdx
.
x
;
int
portStart
=
blockIdx
.
y
*
4
;
int
subtxId
=
blockIdx
.
y
;
int
s1
=
0
;
for
(
int
symbId
=
symbSart
;
symbId
<
(
symbSart
+
5
)
&&
symbId
<
nb_symbols
;
symbId
++
){
x1
[
s1
*
fftsize
+
id
]
=
txdataF
[
symbId
*
fftsize
+
id
];
x1
[
s1
*
fftsize
+
id2
]
=
txdataF
[
symbId
*
fftsize
+
id2
];
int
s1
=
0
;
for
(
int
p
=
portStart
;
p
<
portStart
+
4
;
p
++
){
x1
[
s1
*
2048
+
id
]
=
txdataF
[
p
*
PORTSIZE
+
symbId
*
fftsize
+
id
];
x1
[
s1
*
2048
+
id2
]
=
txdataF
[
p
*
PORTSIZE
+
symbId
*
fftsize
+
id2
];
s1
++
;
}
for
(
int
aa
=
0
;
aa
<
nb_tx
;
aa
++
){
for
(
int
symbId
=
symbSart
;
symbId
<
(
symbSart
+
5
)
&&
symbId
<
nb_symbols
;
symbId
++
){
int
resId
=
aa
*
aaSize
+
portId
*
portSize
+
symbId
*
fftsize
;
s1
=
symbId
%
5
;
beamComp
(
&
res
[
resId
+
id
],
&
x1
[
s1
*
fftsize
+
id
],
&
weight
[
portId
*
(
nb_tx
*
fftsize
)
+
aa
*
fftsize
+
id
]);
beamComp
(
&
res
[
resId
+
id2
],
&
x1
[
s1
*
fftsize
+
id2
],
&
weight
[
portId
*
(
nb_tx
*
fftsize
)
+
aa
*
fftsize
+
id2
]);
res
[
id
]
=
0
;
res
[
id2
]
=
0
;
s1
=
0
;
for
(
int
p
=
portStart
;
p
<
portStart
+
4
;
p
++
){
beamComp
(
&
res
[
id
],
&
x1
[
s1
*
2048
+
id
],
&
weight
[
p
*
BW_PSIZE
+
aa
*
fftsize
+
id
]);
beamComp
(
&
res
[
id2
],
&
x1
[
s1
*
2048
+
id2
],
&
weight
[
p
*
BW_PSIZE
+
aa
*
fftsize
+
id2
]);
/*
if(id==0){
printf("%5d+%5di mul %5d+%5di = %5d+%5di\n",
((short*)&x1[s1*2048+id])[0], ((short*)&x1[s1*2048+id])[1],
((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[0],((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[1],
((short*)&res[id])[0], ((short*)&res[id])[1]);
}*/
s1
++
;
}
}
int
offset
=
subtxId
*
SUBTXSIZE
+
aa
*
PORTSIZE
+
symbId
*
fftsize
;
sub
[
offset
+
id
]
=
res
[
id
];
sub
[
offset
+
id2
]
=
res
[
id2
];
}
}
__device__
inline
void
partAdd
(
int
*
res
,
int
*
x
){
((
short
*
)
res
)[
0
]
+=
((
short
*
)
x
)[
0
];
((
short
*
)
res
)[
1
]
+=
((
short
*
)
x
)[
1
];
__device__
inline
void
partAdd
(
Complex
*
res
,
int
*
x1
,
int
*
x2
){
res
->
x
=
((
short
*
)
x1
)[
0
]
+
((
short
*
)
x2
)[
0
];
res
->
y
=
((
short
*
)
x1
)[
1
]
+
((
short
*
)
x2
)[
1
];
}
__global__
void
combine
(
int
*
res
,
int
*
txdataF_BF
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
__shared__
int
buf
[
2048
*
5
];
int
symbStart
=
blockIdx
.
x
;
int
txId
=
blockIdx
.
y
;
__global__
void
combine
(
int
*
subtx
,
Complex
*
d_signal
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
int
id
=
threadIdx
.
x
;
int
id2
=
id
+
1024
;
int
txSize
=
nb_antenna_ports
*
nb_symbols
*
fftsize
;
int
portSize
=
nb_symbols
*
fftsize
;
int
s1
=
0
;
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
for
(
int
symbId
=
symbStart
;
symbId
<
symbId
+
5
&&
symbId
<
nb_symbols
;
symbId
++
){
s1
=
symbId
%
5
;
partAdd
(
&
buf
[
s1
+
id
],
&
res
[
txId
*
txSize
+
p
*
portSize
+
symbId
*
fftsize
+
id
]);
partAdd
(
&
buf
[
s1
+
id2
],
&
res
[
txId
*
txSize
+
p
*
portSize
+
symbId
*
fftsize
+
id2
]);
}
}
for
(
int
symbId
=
symbStart
;
symbId
<
symbId
+
5
&&
symbId
<
nb_symbols
;
symbId
++
){
s1
=
symbId
%
5
;
res
[
txId
*
nb_symbols
*
fftsize
+
symbId
*
fftsize
+
id
]
=
buf
[
s1
*
fftsize
+
id
];
res
[
txId
*
nb_symbols
*
fftsize
+
symbId
*
fftsize
+
id2
]
=
buf
[
s1
*
fftsize
+
id2
];
int
aa
=
blockIdx
.
x
;
int
symbStart
=
blockIdx
.
y
*
7
;
int
symbEnd
=
symbStart
+
7
;
for
(
int
symb
=
symbStart
;
symb
<
symbEnd
;
symb
++
){
int
offset
=
aa
*
PORTSIZE
+
symb
*
fftsize
;
partAdd
(
&
d_signal
[
offset
+
id
],
&
subtx
[
offset
+
id
],
&
subtx
[
SUBTXSIZE
+
offset
+
id
]);
partAdd
(
&
d_signal
[
offset
+
id2
],
&
subtx
[
offset
+
id2
],
&
subtx
[
SUBTXSIZE
+
offset
+
id2
]);
//if(id==0) printf("%5.5f+%5.5fi\n", d_signal[offset+id].x, d_signal[offset+id].y);
}
}
extern
"C"
void
CUDA_beam_precoding
(
int
**
txdataF
,
int
***
weight
,
int
L_ssb
,
int
shift
,
int
fftsize
,
int
nb_symbols
,
int
nb_antenna_ports
,
int
nb_tx
){
cudaEvent_t
start
,
stop
;
float
time
;
...
...
@@ -216,6 +224,7 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
//initial BF data;
gpuErrchk
(
cudaMemset
(
cu_ru
.
d_txdataF_BF
,
0
,
fftsize
*
nb_symbols
*
sizeof
(
int
)
*
nb_tx
)
);
gpuErrchk
(
cudaMemset
(
cu_ru
.
d_subtx
,
0
,
fftsize
*
nb_symbols
*
nb_tx
*
2
*
sizeof
(
int
))
);
//move data to gpu
int
slotsize
=
fftsize
*
nb_symbols
;
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
...
...
@@ -223,8 +232,6 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
}
cudaEventRecord
(
start
);
int
threadNum
=
1024
;
int
blockNum
=
fftsize
*
nb_symbols
/
threadNum
;
int
div
=
1
<<
shift
;
for
(
int
aa
=
0
;
aa
<
nb_tx
;
aa
++
){
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
...
...
@@ -241,11 +248,12 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
cudaEventRecord
(
start
);
dim3
block
(
3
,
8
);
dim3
block
(
14
,
2
,
1
);
dim3
thread
(
1024
);
conjMulAll
<<<
block
,
thread
>>>
(
cu_ru
.
d_txdataF
,
cu_ru
.
d_weight
,
cu_ru
.
d_
res
,
conjMulAll
<<<
block
,
thread
>>>
(
cu_ru
.
d_txdataF
,
cu_ru
.
d_weight
,
cu_ru
.
d_
subtx
,
fftsize
,
nb_symbols
,
nb_tx
,
nb_antenna_ports
);
combine
<<<
block
,
thread
>>>
(
cu_ru
.
d_res
,
cu_ru
.
d_txdataF_BF
,
block
=
dim3
(
8
,
2
,
1
);
combine
<<<
block
,
thread
>>>
(
cu_ru
.
d_subtx
,
cu_ru
.
d_signal
,
fftsize
,
nb_symbols
,
nb_tx
,
nb_antenna_ports
);
cudaEventRecord
(
stop
);
...
...
openair1/CUDA/cuda_struct.h
View file @
21c5e485
...
...
@@ -15,7 +15,7 @@ typedef struct cuda_cu_ru_t{
//beamforming precoding
int
*
d_txdataF
;
//14symb-port0, 14symb-port1, ......
int
*
d_weight
;
//[p * tx * fftsize]
int
*
d_
res
;
int
*
d_
subtx
;
//14symb-subport0, 14symb-subport1, ..., 14symb-subport0, 14symb-subport1, ...
//ifft
int
*
d_txdataF_BF
;
//14symb-tx0, 14symb-tx1, ......
...
...
@@ -26,6 +26,7 @@ typedef struct cuda_cu_ru_t{
extern
cuda_cu_ru
cu_ru
;
#if __cplusplus
}
#endif
...
...
openair1/CUDA/init_cuda.cu
View file @
21c5e485
...
...
@@ -41,6 +41,10 @@
cuda_cu_ru
cu_ru
;
__constant__
int
PORTSIZE
;
__constant__
int
SUBTXSIZE
;
__constant__
int
BW_PSIZE
;
extern
"C"
void
init_cuda
(
int
nb_tx
,
int
nb_symbols
,
int
fftsize
){
printf
(
"init_cuda %d %d %d
\n\n\n
"
,
nb_tx
,
nb_symbols
,
fftsize
);
...
...
@@ -49,7 +53,7 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
//beamforming precoding
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_txdataF
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
nb_symbols
*
fftsize
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_weight
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
fftsize
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_
res
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
fftsize
*
nb_symbols
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_
subtx
,
sizeof
(
int
)
*
nb_tx
*
fftsize
*
nb_symbols
*
2
)
);
//ifft
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_txdataF_BF
,
fftsize
*
sizeof
(
int
)
*
nb_symbols
*
nb_tx
)
);
...
...
@@ -57,6 +61,11 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_data_wCP
,
fftsize
*
(
nb_symbols
+
1
)
*
nb_tx
*
sizeof
(
int
))
);
cufftErrchk
(
cufftPlan1d
(
&
cu_ru
.
plan
,
fftsize
,
CUFFT_C2C
,
nb_symbols
*
nb_tx
)
);
int
portSize
=
fftsize
*
nb_symbols
;
int
subtxsize
=
nb_tx
*
nb_symbols
*
fftsize
;
int
bw_psize
=
nb_tx
*
fftsize
;
gpuErrchk
(
cudaMemcpyToSymbol
(
PORTSIZE
,
&
portSize
,
sizeof
(
int
))
);
gpuErrchk
(
cudaMemcpyToSymbol
(
SUBTXSIZE
,
&
subtxsize
,
sizeof
(
int
))
);
gpuErrchk
(
cudaMemcpyToSymbol
(
BW_PSIZE
,
&
bw_psize
,
sizeof
(
int
))
);
}
openair1/SCHED_NR/nr_ru_procedures.c
View file @
21c5e485
...
...
@@ -131,6 +131,12 @@ void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
((
short
*
)
&
ru
->
common
.
txdataF
[
p
][
j
])[
0
]
=
1
;
((
short
*
)
&
ru
->
common
.
txdataF
[
p
][
j
])[
1
]
=
1
;
}
for
(
int
aa
=
0
;
aa
<
ru
->
nb_tx
;
aa
++
){
for
(
int
j
=
0
;
j
<
fp
->
ofdm_symbol_size
;
j
++
){
((
short
*
)
&
ru
->
beam_weights
[
0
][
aa
][
p
][
j
])[
0
]
=
2
;
((
short
*
)
&
ru
->
beam_weights
[
0
][
aa
][
p
][
j
])[
1
]
=
2
;
}
}
}
clock_t
start
,
end
;
start
=
clock
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment