Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
promise
OpenXG-RAN
Commits
21c5e485
Commit
21c5e485
authored
5 years ago
by
tyhsu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
v3. decrease GMEM access in beamforming precode
parent
6a271962
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
77 additions
and
53 deletions
+77
-53
openair1/CUDA/CUDA_phy_procedure.cu
openair1/CUDA/CUDA_phy_procedure.cu
+58
-50
openair1/CUDA/cuda_struct.h
openair1/CUDA/cuda_struct.h
+2
-1
openair1/CUDA/init_cuda.cu
openair1/CUDA/init_cuda.cu
+11
-2
openair1/SCHED_NR/nr_ru_procedures.c
openair1/SCHED_NR/nr_ru_procedures.c
+6
-0
No files found.
openair1/CUDA/CUDA_phy_procedure.cu
View file @
21c5e485
...
...
@@ -116,7 +116,7 @@ extern "C" void CUDA_ifft_ofdm( int **output,
int
threadNum
=
1024
;
int
blockNum
=
fftsize
*
nb_symbols
*
nb_tx
/
threadNum
;
cu_intToComplex
<<<
blockNum
,
threadNum
>>>
(
d_txdataF_BF
,
d_signal
);
//
cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
//CHECK_STATE("cu_intToComplex");
cufftErrchk
(
cufftExecC2C
(
plan
,
(
cufftComplex
*
)
d_signal
,
(
cufftComplex
*
)
d_signal
,
CUFFT_INVERSE
));
...
...
@@ -144,70 +144,78 @@ extern "C" void CUDA_ifft_ofdm( int **output,
}
__device__
inline
void
beamComp
(
int
*
res
,
int
*
x1
,
int
*
x2
){
((
short
*
)
res
)[
0
]
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
0
]
+
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
1
];
((
short
*
)
res
)[
1
]
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
1
]
-
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
0
];
((
short
*
)
res
)[
0
]
+
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
0
]
+
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
1
];
((
short
*
)
res
)[
1
]
+
=
((
short
*
)
x1
)[
0
]
*
((
short
*
)
x2
)[
1
]
-
((
short
*
)
x1
)[
1
]
*
((
short
*
)
x2
)[
0
];
}
__global__
void
conjMulAll
(
int
*
txdataF
,
int
*
weight
,
int
*
res
,
extern
__constant__
int
PORTSIZE
;
extern
__constant__
int
SUBTXSIZE
;
extern
__constant__
int
BW_PSIZE
;
__global__
void
conjMulAll
(
int
*
txdataF
,
int
*
weight
,
int
*
sub
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
__shared__
int
x1
[
2048
*
5
];
int
symbSart
=
blockIdx
.
x
*
5
;
int
portId
=
blockIdx
.
y
;
__shared__
int
x1
[
2048
*
4
];
__shared__
int
res
[
2048
]
;
int
id
=
threadIdx
.
x
;
int
id2
=
id
+
1024
;
int
aaSize
=
nb_antenna_ports
*
nb_symbols
*
fftsize
;
int
portSize
=
nb_symbols
*
fftsize
;
int
symbId
=
blockIdx
.
x
;
int
portStart
=
blockIdx
.
y
*
4
;
int
subtxId
=
blockIdx
.
y
;
int
s1
=
0
;
for
(
int
symbId
=
symbSart
;
symbId
<
(
symbSart
+
5
)
&&
symbId
<
nb_symbols
;
symbId
++
){
x1
[
s1
*
fftsize
+
id
]
=
txdataF
[
symbId
*
fftsize
+
id
];
x1
[
s1
*
fftsize
+
id2
]
=
txdataF
[
symbId
*
fftsize
+
id2
];
int
s1
=
0
;
for
(
int
p
=
portStart
;
p
<
portStart
+
4
;
p
++
){
x1
[
s1
*
2048
+
id
]
=
txdataF
[
p
*
PORTSIZE
+
symbId
*
fftsize
+
id
];
x1
[
s1
*
2048
+
id2
]
=
txdataF
[
p
*
PORTSIZE
+
symbId
*
fftsize
+
id2
];
s1
++
;
}
for
(
int
aa
=
0
;
aa
<
nb_tx
;
aa
++
){
for
(
int
symbId
=
symbSart
;
symbId
<
(
symbSart
+
5
)
&&
symbId
<
nb_symbols
;
symbId
++
){
int
resId
=
aa
*
aaSize
+
portId
*
portSize
+
symbId
*
fftsize
;
s1
=
symbId
%
5
;
beamComp
(
&
res
[
resId
+
id
],
&
x1
[
s1
*
fftsize
+
id
],
&
weight
[
portId
*
(
nb_tx
*
fftsize
)
+
aa
*
fftsize
+
id
]);
beamComp
(
&
res
[
resId
+
id2
],
&
x1
[
s1
*
fftsize
+
id2
],
&
weight
[
portId
*
(
nb_tx
*
fftsize
)
+
aa
*
fftsize
+
id2
]);
res
[
id
]
=
0
;
res
[
id2
]
=
0
;
s1
=
0
;
for
(
int
p
=
portStart
;
p
<
portStart
+
4
;
p
++
){
beamComp
(
&
res
[
id
],
&
x1
[
s1
*
2048
+
id
],
&
weight
[
p
*
BW_PSIZE
+
aa
*
fftsize
+
id
]);
beamComp
(
&
res
[
id2
],
&
x1
[
s1
*
2048
+
id2
],
&
weight
[
p
*
BW_PSIZE
+
aa
*
fftsize
+
id2
]);
/*
if(id==0){
printf("%5d+%5di mul %5d+%5di = %5d+%5di\n",
((short*)&x1[s1*2048+id])[0], ((short*)&x1[s1*2048+id])[1],
((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[0],((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[1],
((short*)&res[id])[0], ((short*)&res[id])[1]);
}*/
s1
++
;
}
}
int
offset
=
subtxId
*
SUBTXSIZE
+
aa
*
PORTSIZE
+
symbId
*
fftsize
;
sub
[
offset
+
id
]
=
res
[
id
];
sub
[
offset
+
id2
]
=
res
[
id2
];
}
}
__device__
inline
void
partAdd
(
int
*
res
,
int
*
x
){
((
short
*
)
res
)[
0
]
+=
((
short
*
)
x
)[
0
];
((
short
*
)
res
)[
1
]
+=
((
short
*
)
x
)[
1
];
__device__
inline
void
partAdd
(
Complex
*
res
,
int
*
x1
,
int
*
x2
){
res
->
x
=
((
short
*
)
x1
)[
0
]
+
((
short
*
)
x2
)[
0
];
res
->
y
=
((
short
*
)
x1
)[
1
]
+
((
short
*
)
x2
)[
1
];
}
__global__
void
combine
(
int
*
res
,
int
*
txdataF_BF
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
__shared__
int
buf
[
2048
*
5
];
int
symbStart
=
blockIdx
.
x
;
int
txId
=
blockIdx
.
y
;
__global__
void
combine
(
int
*
subtx
,
Complex
*
d_signal
,
int
fftsize
,
int
nb_symbols
,
int
nb_tx
,
int
nb_antenna_ports
){
int
id
=
threadIdx
.
x
;
int
id2
=
id
+
1024
;
int
txSize
=
nb_antenna_ports
*
nb_symbols
*
fftsize
;
int
portSize
=
nb_symbols
*
fftsize
;
int
s1
=
0
;
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
for
(
int
symbId
=
symbStart
;
symbId
<
symbId
+
5
&&
symbId
<
nb_symbols
;
symbId
++
){
s1
=
symbId
%
5
;
partAdd
(
&
buf
[
s1
+
id
],
&
res
[
txId
*
txSize
+
p
*
portSize
+
symbId
*
fftsize
+
id
]);
partAdd
(
&
buf
[
s1
+
id2
],
&
res
[
txId
*
txSize
+
p
*
portSize
+
symbId
*
fftsize
+
id2
]);
}
}
for
(
int
symbId
=
symbStart
;
symbId
<
symbId
+
5
&&
symbId
<
nb_symbols
;
symbId
++
){
s1
=
symbId
%
5
;
res
[
txId
*
nb_symbols
*
fftsize
+
symbId
*
fftsize
+
id
]
=
buf
[
s1
*
fftsize
+
id
];
res
[
txId
*
nb_symbols
*
fftsize
+
symbId
*
fftsize
+
id2
]
=
buf
[
s1
*
fftsize
+
id2
];
int
aa
=
blockIdx
.
x
;
int
symbStart
=
blockIdx
.
y
*
7
;
int
symbEnd
=
symbStart
+
7
;
for
(
int
symb
=
symbStart
;
symb
<
symbEnd
;
symb
++
){
int
offset
=
aa
*
PORTSIZE
+
symb
*
fftsize
;
partAdd
(
&
d_signal
[
offset
+
id
],
&
subtx
[
offset
+
id
],
&
subtx
[
SUBTXSIZE
+
offset
+
id
]);
partAdd
(
&
d_signal
[
offset
+
id2
],
&
subtx
[
offset
+
id2
],
&
subtx
[
SUBTXSIZE
+
offset
+
id2
]);
//if(id==0) printf("%5.5f+%5.5fi\n", d_signal[offset+id].x, d_signal[offset+id].y);
}
}
extern
"C"
void
CUDA_beam_precoding
(
int
**
txdataF
,
int
***
weight
,
int
L_ssb
,
int
shift
,
int
fftsize
,
int
nb_symbols
,
int
nb_antenna_ports
,
int
nb_tx
){
cudaEvent_t
start
,
stop
;
float
time
;
...
...
@@ -216,6 +224,7 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
//initial BF data;
gpuErrchk
(
cudaMemset
(
cu_ru
.
d_txdataF_BF
,
0
,
fftsize
*
nb_symbols
*
sizeof
(
int
)
*
nb_tx
)
);
gpuErrchk
(
cudaMemset
(
cu_ru
.
d_subtx
,
0
,
fftsize
*
nb_symbols
*
nb_tx
*
2
*
sizeof
(
int
))
);
//move data to gpu
int
slotsize
=
fftsize
*
nb_symbols
;
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
...
...
@@ -223,8 +232,6 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
}
cudaEventRecord
(
start
);
int
threadNum
=
1024
;
int
blockNum
=
fftsize
*
nb_symbols
/
threadNum
;
int
div
=
1
<<
shift
;
for
(
int
aa
=
0
;
aa
<
nb_tx
;
aa
++
){
for
(
int
p
=
0
;
p
<
nb_antenna_ports
;
p
++
){
...
...
@@ -241,11 +248,12 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
cudaEventRecord
(
start
);
dim3
block
(
3
,
8
);
dim3
block
(
14
,
2
,
1
);
dim3
thread
(
1024
);
conjMulAll
<<<
block
,
thread
>>>
(
cu_ru
.
d_txdataF
,
cu_ru
.
d_weight
,
cu_ru
.
d_
res
,
conjMulAll
<<<
block
,
thread
>>>
(
cu_ru
.
d_txdataF
,
cu_ru
.
d_weight
,
cu_ru
.
d_
subtx
,
fftsize
,
nb_symbols
,
nb_tx
,
nb_antenna_ports
);
combine
<<<
block
,
thread
>>>
(
cu_ru
.
d_res
,
cu_ru
.
d_txdataF_BF
,
block
=
dim3
(
8
,
2
,
1
);
combine
<<<
block
,
thread
>>>
(
cu_ru
.
d_subtx
,
cu_ru
.
d_signal
,
fftsize
,
nb_symbols
,
nb_tx
,
nb_antenna_ports
);
cudaEventRecord
(
stop
);
...
...
This diff is collapsed.
Click to expand it.
openair1/CUDA/cuda_struct.h
View file @
21c5e485
...
...
@@ -15,7 +15,7 @@ typedef struct cuda_cu_ru_t{
//beamforming precoding
int
*
d_txdataF
;
//14symb-port0, 14symb-port1, ......
int
*
d_weight
;
//[p * tx * fftsize]
int
*
d_
res
;
int
*
d_
subtx
;
//14symb-subport0, 14symb-subport1, ..., 14symb-subport0, 14symb-subport1, ...
//ifft
int
*
d_txdataF_BF
;
//14symb-tx0, 14symb-tx1, ......
...
...
@@ -26,6 +26,7 @@ typedef struct cuda_cu_ru_t{
extern
cuda_cu_ru
cu_ru
;
#if __cplusplus
}
#endif
...
...
This diff is collapsed.
Click to expand it.
openair1/CUDA/init_cuda.cu
View file @
21c5e485
...
...
@@ -41,6 +41,10 @@
cuda_cu_ru
cu_ru
;
__constant__
int
PORTSIZE
;
__constant__
int
SUBTXSIZE
;
__constant__
int
BW_PSIZE
;
extern
"C"
void
init_cuda
(
int
nb_tx
,
int
nb_symbols
,
int
fftsize
){
printf
(
"init_cuda %d %d %d
\n\n\n
"
,
nb_tx
,
nb_symbols
,
fftsize
);
...
...
@@ -49,7 +53,7 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
//beamforming precoding
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_txdataF
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
nb_symbols
*
fftsize
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_weight
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
fftsize
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_
res
,
sizeof
(
int
)
*
nb_tx
*
nb_antenna_ports
*
fftsize
*
nb_symbols
)
);
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_
subtx
,
sizeof
(
int
)
*
nb_tx
*
fftsize
*
nb_symbols
*
2
)
);
//ifft
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_txdataF_BF
,
fftsize
*
sizeof
(
int
)
*
nb_symbols
*
nb_tx
)
);
...
...
@@ -57,6 +61,11 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
gpuErrchk
(
cudaMalloc
((
void
**
)
&
cu_ru
.
d_data_wCP
,
fftsize
*
(
nb_symbols
+
1
)
*
nb_tx
*
sizeof
(
int
))
);
cufftErrchk
(
cufftPlan1d
(
&
cu_ru
.
plan
,
fftsize
,
CUFFT_C2C
,
nb_symbols
*
nb_tx
)
);
int
portSize
=
fftsize
*
nb_symbols
;
int
subtxsize
=
nb_tx
*
nb_symbols
*
fftsize
;
int
bw_psize
=
nb_tx
*
fftsize
;
gpuErrchk
(
cudaMemcpyToSymbol
(
PORTSIZE
,
&
portSize
,
sizeof
(
int
))
);
gpuErrchk
(
cudaMemcpyToSymbol
(
SUBTXSIZE
,
&
subtxsize
,
sizeof
(
int
))
);
gpuErrchk
(
cudaMemcpyToSymbol
(
BW_PSIZE
,
&
bw_psize
,
sizeof
(
int
))
);
}
This diff is collapsed.
Click to expand it.
openair1/SCHED_NR/nr_ru_procedures.c
View file @
21c5e485
...
...
@@ -131,6 +131,12 @@ void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
((
short
*
)
&
ru
->
common
.
txdataF
[
p
][
j
])[
0
]
=
1
;
((
short
*
)
&
ru
->
common
.
txdataF
[
p
][
j
])[
1
]
=
1
;
}
for
(
int
aa
=
0
;
aa
<
ru
->
nb_tx
;
aa
++
){
for
(
int
j
=
0
;
j
<
fp
->
ofdm_symbol_size
;
j
++
){
((
short
*
)
&
ru
->
beam_weights
[
0
][
aa
][
p
][
j
])[
0
]
=
2
;
((
short
*
)
&
ru
->
beam_weights
[
0
][
aa
][
p
][
j
])[
1
]
=
2
;
}
}
}
clock_t
start
,
end
;
start
=
clock
();
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment