Commit 689426e8 authored by NCTU CS ISIP's avatar NCTU CS ISIP

LDPC implement on Quadro p5000

parent 1f6dbb20
......@@ -23,6 +23,25 @@
cmake_minimum_required (VERSION 3.0)
##############################################
# Base CUDA setting
##############################################
find_package(CUDA)
message ("cuda include ${CUDA_INCLUDE_DIRS}")
message ("cuda library ${CUDA_LIBRARY_DIRS}")
add_definitions("-L/usr/local/cuda/lib64")
SET(CUDA_NVCC_FLAGS
"${CUDA_NVCC_FLAGS};-arch=sm_60")
# Disable warnings for CUDA
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lpthread;-w;-O3;--default-stream;per-thread;-I/usr/local/cuda/inc;-L/usr/local/cuda/lib -lcutil;-rdc=true;-lcudadevrt")
SET(CUDA_VERBOSE_BUILD ON)
SET(CUDA_HOST_COMPILER "/usr/bin/g++")
SET(CUDA_SEPARABLE_COMPILATION ON)
#########################################################
# Base directories, compatible with legacy OAI building #
#########################################################
......@@ -174,7 +193,7 @@ set(CMAKE_C_FLAGS
"${CMAKE_C_FLAGS} -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_FCNTL_H=1 -DHAVE_ARPA_INET_H=1 -DHAVE_SYS_TIME_H=1 -DHAVE_SYS_SOCKET_H=1 -DHAVE_STRERROR=1 -DHAVE_SOCKET=1 -DHAVE_MEMSET=1 -DHAVE_GETTIMEOFDAY=1 -DHAVE_STDLIB_H=1 -DHAVE_MALLOC=1 -DHAVE_LIBSCTP -D${MKVER}"
)
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} ${C_FLAGS_PROCESSOR} -Wno-packed-bitfield-compat -fPIC -Wall -fno-strict-aliasing -rdynamic -std=c++11 -D${MKVER}"
"${CMAKE_CXX_FLAGS} ${C_FLAGS_PROCESSOR} -Wno-packed-bitfield-compat -fPIC -Wall -fno-strict-aliasing -rdynamic -std=c++11" #-D${MKVER}
)
add_definitions("-DASN_DISABLE_OER_SUPPORT")
......@@ -1300,6 +1319,7 @@ set(PHY_TURBOIF
)
set(PHY_LDPCSRC
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder2.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_generate_coefficient.c
......@@ -2695,15 +2715,34 @@ target_link_libraries(smallblocktest
m pthread ${ATLAS_LIBRARIES} dl
)
add_executable(ldpctest
###################################################
# For CUDA library
###################################################
CUDA_ADD_LIBRARY(LDPC_CU
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
)
cuda_add_executable(ldpctest
${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
${T_SOURCE}
${SHLIB_LOADER_SOURCES}
)
target_link_libraries(ldpctest
-Wl,--start-group UTIL SIMU PHY_NR CONFIG_LIB -Wl,--end-group
target_link_libraries(ldpctest -ldl
-Wl,--start-group
LDPC_CU UTIL SIMU PHY_NR CONFIG_LIB
-Wl,--end-group
m pthread ${ATLAS_LIBRARIES} dl
)
#add_executable(ldpctest
# ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
# ${T_SOURCE}
# ${SHLIB_LOADER_SOURCES}
# )
#target_link_libraries(ldpctest
# -Wl,--start-group UTIL SIMU PHY_NR CONFIG_LIB -Wl,--end-group
# m pthread ${ATLAS_LIBRARIES} dl
# )
add_executable(nr_dlschsim
${OPENAIR1_DIR}/SIMULATION/NR_PHY/dlschsim.c
......
......@@ -697,7 +697,8 @@ function main() {
if [ "$SIMUS_PHY" = "1" ] ; then
echo_info "Compiling physical unitary tests simulators"
# TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim
simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim"
#simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim"
simlist="ldpctest"
for f in $simlist ; do
compilations \
phy_simulators $f \
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
### LDPC DECODER
+ `intuitive_ldpc`:
Intuitive methods of LDPC decoder. This method stores the whole expanded Base Graph, CN, BN buffer as they are. Then, does the cn and bn process according to the equation in `nrLDPC.pdf`.
+ `optimized_ldpc`:
This version inherits the idea of current oai ldpc_decoder. It stores lots of LUTs and shrinks the cn, bnbuffer size comparing to the former implementation. Right now, it only supports the longest code block 8448.
### Usage
+ `make ldpc` will compile the program `ldpc`.
+ `make prof [num=<0~100>]` will show the detail of GPU activity.
+ `make test` will build ldpc executable and run `check.sh` to verify the correctness of the implementation.
### Verification
+ The input (channel output) of the LDPC decoder is produced by the `ldpctest` program from OAI.
+ The verification is done by comparing the data decoded with the input data (channel output), and also the data output(`estimated_output`) produced by oai `ldpctest`.
NVCC = nvcc
CFLAGS =
PROF = nvprof
exec = ldpc
file = channel_output.txt
OBJS = util.o
code = 8448
num = 1
.PHONY: clean debug
## Target-specific Variable Values
debug: CFLAGS += -Ddebug -g -G
debug: clean $(exec)
cuda-gdb -q --args $(exec) -l $(code) -f $(file)
$(exec): $(exec).cu $(OBJS)
$(NVCC) $(CFLAGS) -o $@ $^
%.o: %.cu
cp ../test_input/8448/$(num).txt $(file)
$(NVCC) $(CFLAGS) -o $@ -c $<
run: clean $(exec)
./$(exec) -l $(code) -f $(file)
prof: clean $(exec)
cp ../test_input/8448/$(num).txt $(file)
$(PROF) ./$(exec) -l $(code) -f $(file)
test: clean $(exec)
./check.sh $(code)
rm $(OBJS) $(file)
clean:
rm -rf *.o test.* $(exec) CN* BN* log.txt channel_output.txt
#!/bin/bash
exec=ldpc
help()
{
echo "Usage: $0 <code_length>"
}
main()
{
if [ -z $@ ]; then
help
exit 1
fi
dir=$@
files=`ls ../test_input/$dir | grep txt`
# files="1.txt 2.txt 3.txt"
index=1
for file in $files
do
cp ../test_input/$dir/$file channel_output.txt
echo "==== test $index ====" >> log.txt
./$exec -l $dir -f channel_output.txt >> log.txt
echo ' ' >> log.txt
index=$((index+1))
done
}
main $@
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include "../BGs/isip_ldpc_bg1_i0.h"
#include "../BGs/isip_ldpc_bg1_i1.h"
#include "../BGs/isip_ldpc_bg1_i2.h"
#include "../BGs/isip_ldpc_bg1_i3.h"
#include "../BGs/isip_ldpc_bg1_i4.h"
#include "../BGs/isip_ldpc_bg1_i5.h"
#include "../BGs/isip_ldpc_bg1_i6.h"
#include "../BGs/isip_ldpc_bg1_i7.h"
#include "../BGs/isip_ldpc_bg2_i0.h"
#include "../BGs/isip_ldpc_bg2_i1.h"
#include "../BGs/isip_ldpc_bg2_i2.h"
#include "../BGs/isip_ldpc_bg2_i3.h"
#include "../BGs/isip_ldpc_bg2_i4.h"
#include "../BGs/isip_ldpc_bg2_i5.h"
#include "../BGs/isip_ldpc_bg2_i6.h"
#include "../BGs/isip_ldpc_bg2_i7.h"
#include "util.h"
#define TNPB 35
#define BNPG 1024
#define ITER 5
__constant__ int BG_GPU[46*68];
__global__
void BNProcess(int flag, int *BN, int *CN, int *CNbuf, const int *const_llr, int BG_col, int BG_row, int Zc)
{
int *CNG = (flag)? CN : CNbuf;
int id = blockIdx.x*blockDim.x + threadIdx.x;
for(int col = id; col < BG_col*Zc; col += (TNPB*BNPG))
{
int tmp = const_llr[col];
for(int row = 0; row < BG_row; row++)
{
int up_shift = (BG_GPU[col/Zc + row*BG_col] - 1)%Zc;
if(up_shift != -1)
{
int row_idx = col%Zc;
row_idx = row_idx - up_shift;
if(row_idx < 0) row_idx = Zc + row_idx;
row_idx = row*Zc + row_idx;
tmp = tmp + CNG[row_idx*BG_col*Zc + col];
}
}
BN[col] = tmp;
}
__syncthreads();
}
__global__ void CNProcess(int flag, int *BN, int *CN, int *CNbuf, int BG_col, int BG_row, int Zc)
{
int *CNG = (flag)? CN : CNbuf;
int *SCNG = (flag)? CNbuf : CN;
int id = blockIdx.x*blockDim.x + threadIdx.x;
for(int row = id; row < BG_row*Zc; row += (TNPB*BNPG))
{
for(int col = 0; col < BG_col; col++)
{
int right_shift = BG_GPU[(row/Zc)*BG_col + col] -1;
if(right_shift != -1)
{
int row_idx = row;
// int col_idx = ((row%384) + right_shift%384) %384 + col*384;
int col_idx = (row + right_shift) %Zc + col*Zc;
int sgn_cnt = 0, min = INT32_MAX;
for(int comp = 0; comp < BG_col; comp++)
{
if(comp == col) continue;
int comp_right_shift = BG_GPU[(row/Zc)*BG_col + comp] -1;
if(comp_right_shift != -1)
{
int comp_row_idx = row;
// int comp_col_idx = ((row%384) + (comp_right_shift%384)) %384 + comp*384;
int comp_col_idx = (row + comp_right_shift) %Zc + comp*Zc;
int tmp = BN[comp_col_idx] - CNG[comp_row_idx*BG_col*Zc + comp_col_idx];
if(tmp < 0)
{
tmp = -tmp;
sgn_cnt++;
}
if(tmp < min) min = tmp;
}
}
SCNG[row_idx*BG_col*Zc + col_idx] = (sgn_cnt%2 == 0)? min: -min;
}
}
}
__syncthreads();
}
__global__ void BitDetermination(int *BN, unsigned int *decode_d)
{
__shared__ int tmp[256];
int tid = blockIdx.x*256 + threadIdx.x;
int bid = threadIdx.x;
tmp[bid] = 0;
if(BN[tid] < 0)
{
tmp[bid] = 1 << (bid&7);
}
__syncthreads();
if(threadIdx.x < 32)
{
decode_d[blockIdx.x*32 + threadIdx.x] = 0;
for(int i = 0; i < 8; i++)
{
decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
}
}
}
// helper function
void printllr(const char *name, int *src, int *des, int count, int type_size)
{
cudaCheck( cudaMemcpy((void *)des, (const void *)src, count*type_size, cudaMemcpyDeviceToHost) );
FILE *fp = fopen(name, "w");
if(!fp) printf("[error]: open file %s failed\n", name);
for(int i = 0; i < count; i++){
fprintf(fp, "llr[%d]= %d\n", i, des[i]);
}
}
int main(int argc, char* argv[])
{
int opt = 0, block_length = 0, BG = 0, Kb = 0, Zc = 0, BG_row = 0, BG_col = 0, lift_index = 0;
char file[50] = {};
short lift_size[51] = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
short lift_set[][9] = {
{2,4,8,16,32,64,128,256},
{3,6,12,24,48,96,192,384},
{5,10,20,40,80,160,320},
{7,14,28,56,112,224},
{9,18,36,72,144,288},
{11,22,44,88,176,352},
{13,26,52,104,208},
{15,30,60,120,240},
{0}};
while( (opt = getopt(argc, argv, "l:f:")) != -1){
switch(opt){
case 'l':
block_length = atoi(optarg);
break;
case 'f':
strncpy(file, optarg, strlen(optarg));
break;
default:
fprintf(stderr, "Usage: %s [-l code block length] <-f input file>\n", argv[0]);
exit(1);
}
}
if(block_length == 0 || file[0] == ' '){
fprintf(stderr, "no input file specified or code block length == 0");
}
if(block_length > 3840){
BG = 1;
Kb = 22;
BG_row = 46;
BG_col = 68;
}else if(block_length <= 3840){
BG = 2;
BG_row = 42;
BG_col = 52;
if(block_length > 640)
Kb = 10;
else if(block_length > 560)
Kb = 9;
else if(block_length > 192)
Kb = 8;
else
Kb = 6;
}
for(int i = 0; i < 51; i++){
if(lift_size[i] >= (double)block_length/Kb){
Zc = lift_size[i];
break;
}
}
for(int i = 0; lift_set[i][0] != 0; i++){
for(int j = 0; lift_set[i][j] != 0; j++){
if(Zc == lift_set[i][j]){
lift_index = i;
break;
}
}
}
int *BG_CPU = NULL;
switch(lift_index){
case 0:
BG_CPU = (BG == 1)? BG1_I0:BG2_I0;
break;
case 1:
BG_CPU = (BG == 1)? BG1_I1:BG2_I1;
break;
case 2:
BG_CPU = (BG == 1)? BG1_I2:BG2_I2;
break;
case 3:
BG_CPU = (BG == 1)? BG1_I3:BG2_I3;
break;
case 4:
BG_CPU = (BG == 1)? BG1_I4:BG2_I4;
break;
case 5:
BG_CPU = (BG == 1)? BG1_I5:BG2_I5;
break;
case 6:
BG_CPU = (BG == 1)? BG1_I6:BG2_I6;
break;
case 7:
BG_CPU = (BG == 1)? BG1_I7:BG2_I7;
break;
}
// printf("BG %d lift_index %d Zc %d BG_row %d BG_col %d\n", BG, lift_index, Zc, BG_row, BG_col);
// alloc cpu memory
unsigned int *input = (unsigned int*)malloc(sizeof(unsigned int)*8448/8), *decode_output_d, *decode_output_h;
int *BN, *CN, *CNbuf, *channel_output, *const_llr;
// debug
// int *p_BN = (int*)calloc(68*384, sizeof(int));
// int *p_CN = (int*)calloc(68*384*46*384, sizeof(int));
int *debug_llr = (int*)calloc(68*384, sizeof(int));
cudaCheck( cudaHostAlloc((void**)&channel_output, 68*384*sizeof(int), cudaHostAllocDefault) );
cudaCheck( cudaHostAlloc((void**)&decode_output_h, (8448/8)*sizeof(unsigned int), cudaHostAllocMapped) );
// | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
// read data from input file
ReadDataFromFile(file, input, channel_output, block_length, BG_col, Zc);
// alloc gpu memory
// BG
cudaCheck( cudaMemcpyToSymbol(BG_GPU, BG_CPU, BG_col*BG_row*sizeof(int)) );
// LLR CN BN BUF
size_t p_llr;
cudaCheck( cudaMallocPitch((void**)&const_llr, &p_llr, 68*384*sizeof(int), 1) );
cudaCheck( cudaMallocPitch((void**)&BN, &p_llr, 68*384*sizeof(int), 1) );
cudaCheck( cudaMallocPitch((void**)&CN, &p_llr, 68*384*sizeof(int), 46*384) );
cudaCheck( cudaMallocPitch((void**)&CNbuf, &p_llr, 68*384*sizeof(int), 46*384) );
cudaCheck( cudaMemcpyAsync((void*)const_llr, (const void*)channel_output, 68*384*sizeof(int), cudaMemcpyHostToDevice) );
cudaCheck( cudaMemcpyAsync((void*)BN, (const void*)channel_output, 68*384*sizeof(int), cudaMemcpyHostToDevice) );
cudaCheck( cudaHostGetDevicePointer((void**)&decode_output_d, (void*)decode_output_h, 0) );
cudaDeviceSynchronize();
cudaEvent_t start, end;
float time;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start,0);
dim3 grid(BNPG, 1, 1);
dim3 block(TNPB, 1, 1);
int flag = 0;
char str[20] = {};
for(int it = 0; it < ITER; it++){
CNProcess<<<grid, block>>>(flag, BN, CN, CNbuf, BG_col, BG_row, Zc);
flag = (flag+1)&1;
BNProcess<<<grid, block>>>(flag, BN, CN, CNbuf, const_llr, BG_col, BG_row, Zc);
#ifdef debug
snprintf(str, 20, "%s_%d", "llr", it);
printllr(str, BN, debug_llr, 68*384, sizeof(int));
#endif
}
BitDetermination<<<33, 256>>>(BN, decode_output_d);
cudaDeviceSynchronize();
cudaEventRecord(end,0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, start, end);
int err_num = 0;
for(int i = 0; i < block_length/8; i++){
if(input[i] != decode_output_h[i]){
printf("input[%d] = %d, decode_output[%d] = %d\n", i, input[i], i, decode_output_h[i]);
err_num++;
}
}
printf("err_num == %d\n", err_num);
printf("decode time:%f ms\n",time);
// free resource
free(input);
// free(p_BN);
// free(p_CN);
cudaFreeHost(channel_output);
cudaFreeHost(decode_output_h);
cudaFree(const_llr);
cudaFree(BN);
cudaFree(CN);
cudaFree(CNbuf);
return 0;
}
#include <stdio.h>
#include <cuda_runtime.h>
#include "util.h"
void ReadDataFromFile(const char *file, unsigned int *input_data_arr, int *channel_data_arr, int block_length, int BG_col, int Zc)
{
// static const char testin[] = "../test_input/test_case_1.txt";
file_t inputfile;
strcpy(inputfile.filename, file);
inputfile.fptr = fopen(inputfile.filename, "r");
if(inputfile.fptr == NULL)
{
puts("cannot open file");
}
// data processing
fgets(inputfile.tmp, 100, inputfile.fptr); // get rid of gen test
for(int i = 0; i < block_length/8; i++)
{
fscanf(inputfile.fptr, "%d", &input_data_arr[i]);
}
fgets(inputfile.tmp, 100, inputfile.fptr); // get rid of '\n'
fgets(inputfile.tmp, 100, inputfile.fptr); // get rid of test end
fgets(inputfile.tmp, 100, inputfile.fptr); // get rid of channel
/*
for(int i = 0; i < 2*384; i++)
{
channel_data_arr[i] = 0;
}
*/
for(int i = 0; i < BG_col*Zc; i++)
{
fscanf(inputfile.fptr, "%d", &channel_data_arr[i]);
}
fclose(inputfile.fptr);
}
#ifndef __UTIL_H__
#define __UTIL_H__
// cuda check macro
#define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
inline void cudaAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess){
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
// struct
typedef struct
{
FILE* fptr;
char filename[64];
char tmp[64];
}file_t;
// utility
void ReadDataFromFile(const char*, unsigned int*, int*, int, int, int);
__global__ void BNProcess(int flag, int *BN, int *CN, int *CNbuf, const int *const_llr);
__global__ void CNProcess(int flag, int *BN, int *CN, int *CNbuf);
__global__ void BitDetermination(int *BN, unsigned int *decode_d);
/*
int32_t nrLDPC_decoder(t_nrLDPC_dec_params* p_decParams,
int8_t* p_llr, int8_t* p_out,
t_nrLDPC_procBuf* p_procBuf,
t_nrLDPC_time_stats* p_profiler){
return iter;
}
static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out,
t_nrLDPC_procBuf* p_procBuf,
uint32_t numLLR, t_nrLDPC_lut* p_lut,
t_nrLDPC_dec_params* p_decParams,
t_nrLDPC_time_stats* p_profiler){
return iter;
}
*/
#endif
EXEC = gen_idx
TXT = gen_matrix.txt
DEBUG = debug
FILE = channel_output.txt
num = 1
CFLAGS=
gen: clean gen_idx.c
$(CC) $(EXEC).c -o $(EXEC) -g
ldpc: clean ldpc.cu
cp ../test_input/8448/$(num).txt $(FILE)
nvcc $(CFLAGS) ldpc.cu -o ldpc
mkdir debug
test: ldpc
./check.sh
cgdb: CFLAGS += -g -G
cgdb: clean ldpc
cuda-gdb --silent --args ldpc $(FILE)
prof: clean ldpc
cp ../test_input/8448/$(num).txt $(FILE)
nvprof ./ldpc $(FILE)
.PHONY: clean
clean:
$(RM) -rf $(DEBUG) $(EXEC) $(TXT) ldpc log.txt $(FILE)
set +x
index=1
for file in ../test_input/8448/*
do
cp $file channel_output.txt
echo "===== test $index =====" >> log.txt
./ldpc channel_output.txt >> log.txt
echo ' ' >> log.txt
index=$((index+1))
done
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef test
#include "../BGs/isip_ldpc_bg1_i1.h"
#endif
int l2c_idx[316*384] = {}; // cnbuf[ tid ] = llr[ l2c_idx[tid] ]
int l2b_idx[316*384] = {}; // bnbuf[ tid ] = llr[ l2b_idx[tid] ]
int llr_idx[26113] = {}; // bnbuf2llr, start = llr_idx[tid], end = llr_idx[tid+1], llrbuf[tid] = sum of (bnbuf[start] to bnbuf[end])
int c2b_idx[316*384] = {}; // bnbuf[ tid ] = cnbuf[ c2b_idx[tid] ]
int b2c_idx[316*384] = {}; // cnbuf[ tid ] = bnbuf[ b2c_idx[tid] ]
int cnproc_start_idx[316*384] = {}; // index for cnproc, int start = cnproc_start_idx[tid]
int cnproc_end_idx[316*384] = {}; // index for cnproc, int end = cnproc_end_idx[tid]
int bnproc_start_idx[316*384] = {}; // index for bnproc, int start = bnproc_start_idx[tid]
int bnproc_end_idx[316*384] = {}; // index for bnproc, int end = bnproc_end_idx[tid]
int *matrix_transpose(int *matrix, int row, int col)
{
}
void write_to_file(FILE *fp, const char *arr_name, int *arr, int size)
{
fprintf(fp, "%s[%d] = { %d", arr_name, size, arr[0]);
for(int i = 1; i < size; i++){
fprintf(fp, ", %d", arr[i]);
}
fprintf(fp, "};\n");
}
void matrix_expansion(const int *BG, int row, int col, int Zc, int *matrix)
{
for(int i = 0; i < row; i++){
for(int j = 0; j < col; j++){
int val = BG[i * col + j];
if(val != 0){
val = (val-1)%Zc;
for(int k = i*Zc; k < (i+1)*Zc; k++){
int idx = k*(col*Zc) + (j*Zc+val);
matrix[idx] = 1;
val = (val+1)%Zc;
}
}
}
}
}
void print_matrix(const int *M, int row, int col)
{
printf("Matrix:\n{\n");
for(int i = 0; i < row; i++){
printf("\t");
for(int j = 0; j < col; j++){
printf(" %d,", M[i*col + j]);
}
printf("\n");
}
printf("}\n");
}
void build_index(const int *BG, int row, int col, int Zc)
{
int *matrix = (int*)malloc(row*col*Zc*Zc*sizeof(int));
int cnidx = 0, bnidx = 0, aidx = 0, pidx = 0, lidx1 = 0, lidx2 = 0;
matrix_expansion(BG, row, col, Zc, matrix);
// cn label & l2c_idx
for(int i = 0; i < row*Zc; i++){
for(int j = 0; j < col*Zc; j++){
int k = i*col*Zc + j;
if(matrix[k] == 1){
matrix[k] = cnidx+1;
l2c_idx[cnidx] = j;
cnidx++;
}
}
}
// build c2b_idx & bnproc_idx
for(int i = 0; i < col*Zc; i++){
int cnt = 0;
int start = aidx, end = 0;
for(int j = 0; j < row*Zc; j++){
int val = matrix[i + j*col*Zc];
if(val != 0){
c2b_idx[aidx] = val -1;
aidx++, cnt++;
}
}
end = start + cnt;
while(cnt--){
bnproc_start_idx[pidx] = start;
bnproc_end_idx[pidx] = end;
pidx++;
}
}
printf("cnidx %d\naidx %d\npidx %d\n", cnidx, aidx, pidx);
matrix_expansion(BG, row, col, Zc, matrix);
// bn label & llr_idx
for(int i = 0; i < col*Zc; i++){
llr_idx[lidx1++] = bnidx;
for(int j = 0; j < row*Zc; j++){
int k = i + j*col*Zc;
if(matrix[k] == 1){
matrix[k] = bnidx+1;
l2b_idx[lidx2++] = i;
bnidx++;
}
}
}
llr_idx[lidx1] = bnidx;
// build b2c_idx & cnproc_idx
aidx = pidx = 0;
for(int i = 0; i < row*Zc; i++){
int cnt = 0;
int start = aidx, end = 0;
for(int j = 0; j < col*Zc; j++){
int val = matrix[i*col*Zc + j];
if(val != 0){
b2c_idx[aidx] = val - 1;
aidx++, cnt++;
}
}
end = start + cnt;
while(cnt--){
cnproc_start_idx[pidx] = start;
cnproc_end_idx[pidx] = end;
pidx++;
}
}
printf("bnidx %d\naidx %d\npidx %d\n", bnidx, aidx, pidx);
printf("lidx1 %d\nlidx2 %d\n", lidx1, lidx2);
free(matrix);
}
void generate_header(const char *file, int col, int entry, int Zc)
{
#ifdef test
FILE *f = fopen("test.h", "w");
#else
FILE *f = fopen(file, "w");
#endif
write_to_file(f, "int l2c_idx", l2c_idx, entry*Zc);
write_to_file(f, "int c2b_idx", c2b_idx, entry*Zc);
write_to_file(f, "int b2c_idx", b2c_idx, entry*Zc);
write_to_file(f, "int cnproc_start_idx", cnproc_start_idx, entry*Zc);
write_to_file(f, "int cnproc_end_idx", cnproc_end_idx, entry*Zc);
write_to_file(f, "int bnproc_start_idx", bnproc_start_idx, entry*Zc);
write_to_file(f, "int bnproc_end_idx", bnproc_end_idx, entry*Zc);
write_to_file(f, "int llr_idx", llr_idx, col*Zc+1);
write_to_file(f, "int l2b_idx", l2b_idx, entry*Zc);
fclose(f);
}
int main(int argc, char** argv)
{
int test_BG[4*5] ={ 2, 0, 1, 2, 0,
0, 3, 0, 2, 2,
1, 0, 3, 3, 0,
0, 3, 0, 0, 1};
// default
int idx = 0, Zc = 384, BG = 1;
int max_row = 46, max_col = 68;
int entry = 316;
char *header_file = "bg1_i1_index_array.h";
#ifdef test
if(argc == 1){
printf("\nusage: %s [BG_row] [BG_col] [Zc] [entry] [output_header]\n\n", argv[0]);
return -1;
}
#endif
if(argc == 6){
max_row = atoi(argv[1]);
max_col = atoi(argv[2]);
Zc = atoi(argv[3]);
entry = atoi(argv[4]);
header_file = argv[5];
printf("max_row %d, max_col %d, Zc %d\n", max_row, max_col, Zc);
}
#ifdef test
build_index(test_BG, max_row, max_col, Zc);
#else
build_index(BG1_I1, max_row, max_col, Zc);
#endif
generate_header(header_file, max_col, entry, Zc);
return 0;
}
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "bg1_i1_index_array.h"
void print_arr_cpu(const char *file, int *arr, int size)
{
FILE *fp = fopen(file, "w");
for(int i = 0; i < size; i++){
fprintf(fp, "%s[%d]: %d\n", file, i, arr[i]);
}
fclose(fp);
}
void print_arr(const char *file, int *arr, int size)
{
int *tmp = (int*)malloc(sizeof(int)*size);
FILE *fp = fopen(file, "w");
cudaMemcpy((void*)tmp, (const void*)arr, size*sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for(int i = 0; i < size; i++){
fprintf(fp, "%s[%d]: %d\n", file, i, tmp[i]);
}
free(tmp);
fclose(fp);
}
__global__ void llr2CN(float *llr, float *cnbuf, int *l2c_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
cnbuf[tid] = llr[l2c_idx[tid]];
__syncthreads();
}
__global__ void llr2BN(float *llr, float *const_llr, int *l2b_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
const_llr[tid] = llr[l2b_idx[tid]];
__syncthreads();
}
__global__ void CNProcess(float *cnbuf, float *bnbuf, int *b2c_idx, int *cnproc_start_idx, int *cnproc_end_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = cnproc_start_idx[tid];
int end = cnproc_end_idx[tid];
int sgn = 1, val = INT32_MAX;
for(int i = start; i < end; i++){
if(i == tid) continue;
int tmp = cnbuf[i];
if(tmp < 0){
tmp = -tmp;
sgn = -sgn;
}
if(val > tmp){
val = tmp;
}
}
bnbuf[b2c_idx[tid]] = sgn*val;// + const_llr[tid];
__syncthreads();
}
__global__ void add(int *bnbuf, int start, int pid, int *buf)
{
__shared__ int sdata[25];
int tid = threadIdx.x;
int num = blockDim.x;
sdata[tid] = bnbuf[start+tid];
for(int s = num/2; s > 0; s>>=1){
if(tid < s){
sdata[tid] += sdata[tid+s];
}
}
if(tid == 0){
buf[pid] = sdata[tid];
}
}
__global__ void BNProcess(float *const_llr, float *bnbuf, float *cnbuf, int *c2b_idx, int *bnproc_start_idx, int *bnproc_end_idx, float *resbuf)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
float val = 0.0;
int start = bnproc_start_idx[tid];
int end = bnproc_end_idx[tid];
for(int i = start; i < end; i++){
if(i == tid) continue;
val += bnbuf[i];
}
// cnbuf[c2b_idx[tid]] = resbuf[tid] + const_llr[tid];
cnbuf[c2b_idx[tid]] = val + const_llr[tid];
__syncthreads();
}
__global__ void BN2llr(float *const_llr, float *bnbuf, float *llrbuf, int *llr_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = llr_idx[tid];
int end = llr_idx[tid+1];
int res = 0.0;
for(int i = start; i < end; i++){
res += bnbuf[i];
}
llrbuf[tid] = res + const_llr[tid];
__syncthreads();
}
__global__ void BitDetermination(float *BN, unsigned int *decode_d)
{
__shared__ int tmp[256];
int tid = blockIdx.x*256 + threadIdx.x;
int bid = threadIdx.x;
tmp[bid] = 0;
if(BN[tid] < 0)
{
tmp[bid] = 1 << (bid&7);
}
__syncthreads();
if(threadIdx.x < 32)
{
decode_d[blockIdx.x*32 + threadIdx.x] = 0;
for(int i = 0; i < 8; i++)
{
decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
}
}
}
void Read_Data(char *filename, int *data_sent, float *data_received)
{
FILE *fp = fopen(filename, "r");
fscanf(fp, "%*s");
for(int i = 0; i < 1056; i++){
fscanf(fp, "%d", &data_sent[i]);
}
fscanf(fp, "%*s");
fscanf(fp, "%*s");
fscanf(fp, "%*s");
for(int i = 0; i < 26112; i++){
fscanf(fp, "%f", &data_received[i]);
}
fclose(fp);
}
int main(int argc, char **argv)
{
int code_length = 8448, BG = 1;
int *input = (int*)malloc(1056*sizeof(int));
float *llr = (float*)malloc(26112*sizeof(float));
float *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d, *resbuf_d;
unsigned int *decode_output_h, *decode_output_d;
int *l2c_idx_d, *cnproc_start_idx_d, *cnproc_end_idx_d, *c2b_idx_d, *bnproc_start_idx_d, *bnproc_end_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
char *file = argv[1];
int blockNum = 237, threadNum = 512;
//int blockNum = 33, threadNum = 256;
//int blockNum = 17, threadNum = 512;
int rounds = 5, Zc = 384;
Read_Data(file, input, llr);
size_t p_llr;
cudaHostAlloc((void**)&decode_output_h, 1056*sizeof(unsigned int), cudaHostAllocMapped);
cudaMallocPitch((void**)&llr_d, &p_llr, 26112*sizeof(float), 1);
cudaMallocPitch((void**)&llrbuf_d, &p_llr, 26112*sizeof(float), 1);
cudaMallocPitch((void**)&const_llr_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&cnbuf_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&bnbuf_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&l2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&l2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&c2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&b2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&resbuf_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&llr_idx_d, &p_llr, 26113*sizeof(int), 1);
cudaMemcpyAsync((void*)llr_d, (const void*)llr, 68*384*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2c_idx_d, (const void*)l2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2b_idx_d, (const void*)l2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_start_idx_d, (const void*)cnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_end_idx_d, (const void*)cnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)c2b_idx_d, (const void*)c2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_start_idx_d, (const void*)bnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_end_idx_d, (const void*)bnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)b2c_idx_d, (const void*)b2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)llr_idx_d, (const void*)llr_idx, 26113*sizeof(int), cudaMemcpyHostToDevice);
cudaHostGetDevicePointer((void**)&decode_output_d, (void*)decode_output_h, 0);
cudaDeviceSynchronize();
printf("BG %d, Zc %d, code_length %d\n", BG, Zc, code_length);
cudaEvent_t start, end;
float time;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start, 0);
llr2CN<<<blockNum, threadNum>>>(llr_d, cnbuf_d, l2c_idx_d);
llr2BN<<<blockNum, threadNum>>>(llr_d, const_llr_d, l2b_idx_d);
/*
print_arr("debug/const_llr_d", const_llr_d, 26112);
print_arr("debug/cnbuf_d", cnbuf_d, 316*384);
print_arr("debug/const_llrbuf_d", const_llrbuf_d, 316*384);
*/
char dir[] = "debug/", cn[] = "cnbuf", bn[] = "bnbuf", llrstr[] = "llrbuf_d";
char str[100] = {};
for(int i = 0; i < rounds; i++){
CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_start_idx_d, cnproc_end_idx_d);
#ifdef debug
snprintf(str, 20, "%s%s_%d", dir, bn, i+1);
print_arr(str, bnbuf_d, 316*384);
#endif
BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_start_idx_d, bnproc_end_idx_d, resbuf_d);
#ifdef debug
snprintf(str, 20, "%s%s_%d", dir, cn, i+1);
print_arr(str, cnbuf_d, 316*384);
#endif
BN2llr<<<51, 512>>>(llr_d, bnbuf_d, llrbuf_d, llr_idx_d);
#ifdef debug
snprintf(str, 20, "%s%s_%d", dir, llrstr, i+1);
print_arr(str, llrbuf_d, 26112);
#endif
}
BitDetermination<<<33, 256>>>(llrbuf_d, decode_output_d);
cudaDeviceSynchronize();
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
cudaEventElapsedTime(&time, start, end);
printf("time: %.6f ms\n", time);
int err = 0;
for(int i = 0; i < 8448/8; i++){
if(input[i] != decode_output_h[i]){
// printf("input[%d] :%d, decode_output[%d]: %d\n", i, input[i], i, decode_output_h[i]);
err++;
}
}
printf("err: %d\n", err);
free(input);
free(llr);
cudaFree(llr_d);
cudaFree(llrbuf_d);
cudaFree(bnbuf_d);
cudaFree(cnbuf_d);
cudaFree(l2c_idx_d);
cudaFree(cnproc_start_idx_d);
cudaFree(cnproc_end_idx_d);
cudaFree(c2b_idx_d);
cudaFree(bnproc_start_idx_d);
cudaFree(bnproc_end_idx_d);
cudaFree(b2c_idx_d);
cudaFree(const_llr_d);
cudaFree(llr_idx_d);
cudaFree(resbuf_d);
cudaFreeHost(decode_output_h);
return 0;
}
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "test.h"
void print_arr_cpu(const char *file, int *arr, int size)
{
FILE *fp = fopen(file, "w");
for(int i = 0; i < size; i++){
fprintf(fp, "%s[%d]: %d\n", file, i, arr[i]);
}
fclose(fp);
}
void print_arr(const char *file, int *arr, int size)
{
int *tmp = (int*)malloc(sizeof(int)*size);
FILE *fp = fopen(file, "w");
cudaMemcpy((void*)tmp, (const void*)arr, size*sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for(int i = 0; i < size; i++){
fprintf(fp, "%s[%d]: %d\n", file, i, tmp[i]);
}
free(tmp);
fclose(fp);
}
__global__ void llr2CN(int *llr, int *cnbuf, int *l2c_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
cnbuf[tid] = llr[l2c_idx[tid]];
__syncthreads();
}
__global__ void llr2BN(int *llr, int *const_llr, int *l2b_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
const_llr[tid] = llr[l2b_idx[tid]];
__syncthreads();
}
__global__ void CNProcess(int *cnbuf, int *bnbuf, int *b2c_idx, int *cnproc_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = cnproc_idx[tid*2];
int end = cnproc_idx[tid*2+1];
/*
if (tid == 6){
printf("start %d, end %d\n", start, end);
}
*/
int sgn = 1, val = INT32_MAX;
for(int i = start; i < end; i++){
if(i == tid) continue;
int tmp = cnbuf[i];
// if(tid == 6) printf("tmp %d\n", tmp);
if(tmp < 0){
tmp = -tmp;
sgn = -sgn;
}
if(val > tmp){
val = tmp;
// if(tid == 6)printf("val = tmp\n");
}
// if(tid == 6) printf("%d\n", val);
}
/*
if(tid == 6){
printf("b2c_idx[6] %d\n", b2c_idx[tid]);
printf("sgn*val = %d\n", sgn*val);
printf("\n\n");
}
*/
bnbuf[b2c_idx[tid]] = sgn*val;// + const_llr[tid];
__syncthreads();
}
__global__ void BNProcess(int *const_llr, int *bnbuf, int *cnbuf, int *c2b_idx, int *bnproc_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = bnproc_idx[tid*2];
int end = bnproc_idx[tid*2+1];
int val = 0;
for(int i = start; i < end; i++){
if(i == tid) continue;
val += bnbuf[i];
}
cnbuf[c2b_idx[tid]] = val + const_llr[tid];
__syncthreads();
}
__global__ void BN2llr(int *bnbuf, int *llrbuf, int *llr_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = llr_idx[tid];
int end = llr_idx[tid+1];
int res = 0;
for(int i = start; i < end; i++){
res += bnbuf[i];
}
llrbuf[tid] = res;
__syncthreads();
}
__global__ void BitDetermination(int *BN, unsigned int *decode_d)
{
__shared__ int tmp[256];
int tid = blockIdx.x*256 + threadIdx.x;
int bid = threadIdx.x;
tmp[bid] = 0;
if(BN[tid] < 0)
{
tmp[bid] = 1 << (bid&7);
}
__syncthreads();
if(threadIdx.x < 32)
{
decode_d[blockIdx.x*32 + threadIdx.x] = 0;
for(int i = 0; i < 8; i++)
{
decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
}
}
}
int main(int argc, char **argv)
{
int llr[15] = {-48, 27, -47, 13, 34,
-41, 51, 29, -6, -19,
-45, -42, -40, -6, -33};
int *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d;
int *l2c_idx_d, *cnproc_idx_d, *c2b_idx_d, *bnproc_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
int blockNum = 6, threadNum = 11;
int rounds = 5, Zc = 3;
size_t p_llr;
cudaMallocPitch((void**)&llr_d, &p_llr, 15*sizeof(int), 1);
cudaMallocPitch((void**)&llrbuf_d, &p_llr, 15*sizeof(int), 1);
cudaMallocPitch((void**)&const_llr_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&cnbuf_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&bnbuf_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&l2c_idx_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&l2b_idx_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_idx_d, &p_llr, 33*2*sizeof(int), 1);
cudaMallocPitch((void**)&c2b_idx_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_idx_d, &p_llr, 33*2*sizeof(int), 1);
cudaMallocPitch((void**)&b2c_idx_d, &p_llr, 33*sizeof(int), 1);
cudaMallocPitch((void**)&llr_idx_d, &p_llr, 16*sizeof(int), 1);
cudaMemcpyAsync((void*)llr_d, (const void*)llr, 15*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2c_idx_d, (const void*)l2c_idx, 33*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2b_idx_d, (const void*)l2b_idx, 33*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_idx_d, (const void*)cnproc_idx, 33*2*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)c2b_idx_d, (const void*)c2b_idx, 33*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_idx_d, (const void*)bnproc_idx, 33*2*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)b2c_idx_d, (const void*)b2c_idx, 33*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)llr_idx_d, (const void*)llr_idx, 16*sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
llr2CN<<<blockNum, threadNum>>>(llr_d, cnbuf_d, l2c_idx_d);
llr2CN<<<blockNum, threadNum>>>(llr_d, const_llr_d, l2b_idx_d);
/*
print_arr("debug/const_llr_d", const_llr_d, 26112);
print_arr("debug/cnbuf_d", cnbuf_d, 316*384);
print_arr("debug/const_llrbuf_d", const_llrbuf_d, 316*384);
*/
char debug[] = "debug/";
char cn[] = "cnbuf";
char bn[] = "bnbuf";
char llrstr[] = "llrbuf_d";
char str[100] = {};
for(int i = 0; i < rounds; i++){
CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_idx_d);
snprintf(str, 20, "%s%s_%d", debug, bn, i+1);
cudaDeviceSynchronize();
print_arr(str, bnbuf_d, 33);
BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_idx_d);
snprintf(str, 20, "%s%s_%d", debug, cn, i+1);
print_arr(str, cnbuf_d, 33);
BN2llr<<<3,5>>>(bnbuf_d, llrbuf_d, llr_idx_d);
snprintf(str, 20, "%s%s_%d", debug, llrstr, i+1);
print_arr(str, llrbuf_d, 15);
}
// BitDetermination<<<33, 256>>>(llrbuf_d, decode_output_d);
cudaDeviceSynchronize();
cudaFree(llrbuf_d);
cudaFree(bnbuf_d);
cudaFree(cnbuf_d);
cudaFree(l2c_idx_d);
cudaFree(l2b_idx_d);
cudaFree(cnproc_idx_d);
cudaFree(c2b_idx_d);
cudaFree(bnproc_idx_d);
cudaFree(b2c_idx_d);
cudaFree(llr_d);
cudaFree(const_llr_d);
cudaFree(llr_idx_d);
return 0;
}
int l2c_idx[33] = { 1, 6, 10, 2, 7, 11, 0, 8, 9, 5, 10, 13, 3, 11, 14, 4, 9, 12, 0, 8, 11, 1, 6, 9, 2, 7, 10, 5, 12, 3, 13, 4, 14};
int c2b_idx[33] = { 6, 18, 0, 21, 3, 24, 12, 29, 15, 31, 9, 27, 1, 22, 4, 25, 7, 19, 8, 16, 23, 2, 10, 26, 5, 13, 20, 17, 28, 11, 30, 14, 32};
int b2c_idx[33] = { 2, 12, 21, 4, 14, 24, 0, 16, 18, 10, 22, 29, 6, 25, 31, 8, 19, 27, 1, 17, 26, 3, 13, 20, 5, 15, 23, 11, 28, 7, 30, 9, 32};
int cnproc_idx[66] = { 0, 3, 0, 3, 0, 3, 3, 6, 3, 6, 3, 6, 6, 9, 6, 9, 6, 9, 9, 12, 9, 12, 9, 12, 12, 15, 12, 15, 12, 15, 15, 18, 15, 18, 15, 18, 18, 21, 18, 21, 18, 21, 21, 24, 21, 24, 21, 24, 24, 27, 24, 27, 24, 27, 27, 29, 27, 29, 29, 31, 29, 31, 31, 33, 31, 33};
int bnproc_idx[66] = { 0, 2, 0, 2, 2, 4, 2, 4, 4, 6, 4, 6, 6, 8, 6, 8, 8, 10, 8, 10, 10, 12, 10, 12, 12, 14, 12, 14, 14, 16, 14, 16, 16, 18, 16, 18, 18, 21, 18, 21, 18, 21, 21, 24, 21, 24, 21, 24, 24, 27, 24, 27, 24, 27, 27, 29, 27, 29, 29, 31, 29, 31, 31, 33, 31, 33};
int llr_idx[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 24, 27, 29, 31, 33};
int l2b_idx[33] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14};
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment