Commit dfbc35ea authored by tyhsu's avatar tyhsu

Change type from integer to float for floating computation runs faster on GPU(Quadro P5000)

parent b128780e
......@@ -4,22 +4,22 @@
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include "BGs/isip_ldpc_bg1_i0.h"
#include "BGs/isip_ldpc_bg1_i1.h"
#include "BGs/isip_ldpc_bg1_i2.h"
#include "BGs/isip_ldpc_bg1_i3.h"
#include "BGs/isip_ldpc_bg1_i4.h"
#include "BGs/isip_ldpc_bg1_i5.h"
#include "BGs/isip_ldpc_bg1_i6.h"
#include "BGs/isip_ldpc_bg1_i7.h"
#include "BGs/isip_ldpc_bg2_i0.h"
#include "BGs/isip_ldpc_bg2_i1.h"
#include "BGs/isip_ldpc_bg2_i2.h"
#include "BGs/isip_ldpc_bg2_i3.h"
#include "BGs/isip_ldpc_bg2_i4.h"
#include "BGs/isip_ldpc_bg2_i5.h"
#include "BGs/isip_ldpc_bg2_i6.h"
#include "BGs/isip_ldpc_bg2_i7.h"
#include "../BGs/isip_ldpc_bg1_i0.h"
#include "../BGs/isip_ldpc_bg1_i1.h"
#include "../BGs/isip_ldpc_bg1_i2.h"
#include "../BGs/isip_ldpc_bg1_i3.h"
#include "../BGs/isip_ldpc_bg1_i4.h"
#include "../BGs/isip_ldpc_bg1_i5.h"
#include "../BGs/isip_ldpc_bg1_i6.h"
#include "../BGs/isip_ldpc_bg1_i7.h"
#include "../BGs/isip_ldpc_bg2_i0.h"
#include "../BGs/isip_ldpc_bg2_i1.h"
#include "../BGs/isip_ldpc_bg2_i2.h"
#include "../BGs/isip_ldpc_bg2_i3.h"
#include "../BGs/isip_ldpc_bg2_i4.h"
#include "../BGs/isip_ldpc_bg2_i5.h"
#include "../BGs/isip_ldpc_bg2_i6.h"
#include "../BGs/isip_ldpc_bg2_i7.h"
#include "util.h"
#define TNPB 35
......
......@@ -9,7 +9,7 @@ gen: clean gen_idx.c
ldpc: clean ldpc.cu
cp ../test_input/8448/$(num).txt $(FILE)
nvcc ldpc.cu -o ldpc -g -G
nvcc -arch=compute_70 -rdc=true ldpc.cu -o ldpc -g -G
mkdir debug
test: ldpc
......
set +x
index=1
for file in ../test_input/8448/1.txt
do
for file in ../test_input/8448/*
do
cp $file channel_output.txt
echo "===== test $index =====" >> log.txt
./ldpc channel_output.txt >> log.txt
......
......@@ -2,15 +2,17 @@
#include <stdlib.h>
#include <string.h>
#ifndef test
#include "isip_ldpc_bg1_i1.h"
#include "../BGs/isip_ldpc_bg1_i1.h"
#endif
int l2c_idx[316*384] = {}; // cnbuf[ tid ] = llr[ l2c_idx[tid] ]
int l2b_idx[316*384] = {}; // bnbuf[ tid ] = llr[ l2b_idx[tid] ]
int llr_idx[26113] = {}; // bnbuf2llr, start = llr_idx[tid], end = llr_idx[tid+1], llrbuf[tid] = sum of (bnbuf[start] to bnbuf[end])
int c2b_idx[316*384] = {}; // bnbuf[ tid ] = cnbuf[ c2b_idx[tid] ]
int b2c_idx[316*384] = {}; // cnbuf[ tid ] = bnbuf[ b2c_idx[tid] ]
int cnproc_idx[316*384*2] = {}; // index for cnproc, start = cnproc_idx[tid], end = cnproc_idx[tid+1]
int bnproc_idx[316*384*2] = {}; // index for bnproc, start = bnproc_idx[tid], end = bnproc_idx[tid+1]
int cnproc_start_idx[316*384] = {}; // index for cnproc, int start = cnproc_start_idx[tid]
int cnproc_end_idx[316*384] = {}; // index for cnproc, int end = cnproc_end_idx[tid]
int bnproc_start_idx[316*384] = {}; // index for bnproc, int start = bnproc_start_idx[tid]
int bnproc_end_idx[316*384] = {}; // index for bnproc, int end = bnproc_end_idx[tid]
int *matrix_transpose(int *matrix, int row, int col)
{
......@@ -89,8 +91,9 @@ void build_index(const int *BG, int row, int col, int Zc)
}
end = start + cnt;
while(cnt--){
bnproc_idx[pidx++] = start;
bnproc_idx[pidx++] = end;
bnproc_start_idx[pidx] = start;
bnproc_end_idx[pidx] = end;
pidx++;
}
}
......@@ -126,8 +129,9 @@ void build_index(const int *BG, int row, int col, int Zc)
}
end = start + cnt;
while(cnt--){
cnproc_idx[pidx++] = start;
cnproc_idx[pidx++] = end;
cnproc_start_idx[pidx] = start;
cnproc_end_idx[pidx] = end;
pidx++;
}
}
......@@ -147,8 +151,10 @@ void generate_header(const char *file, int col, int entry, int Zc)
write_to_file(f, "int l2c_idx", l2c_idx, entry*Zc);
write_to_file(f, "int c2b_idx", c2b_idx, entry*Zc);
write_to_file(f, "int b2c_idx", b2c_idx, entry*Zc);
write_to_file(f, "int cnproc_idx", cnproc_idx, entry*Zc*2);
write_to_file(f, "int bnproc_idx", bnproc_idx, entry*Zc*2);
write_to_file(f, "int cnproc_start_idx", cnproc_start_idx, entry*Zc);
write_to_file(f, "int cnproc_end_idx", cnproc_end_idx, entry*Zc);
write_to_file(f, "int bnproc_start_idx", bnproc_start_idx, entry*Zc);
write_to_file(f, "int bnproc_end_idx", bnproc_end_idx, entry*Zc);
write_to_file(f, "int llr_idx", llr_idx, col*Zc+1);
write_to_file(f, "int l2b_idx", l2b_idx, entry*Zc);
......
......@@ -27,7 +27,7 @@ void print_arr(const char *file, int *arr, int size)
fclose(fp);
}
__global__ void llr2CN(int *llr, int *cnbuf, int *l2c_idx)
__global__ void llr2CN(float *llr, float *cnbuf, int *l2c_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
......@@ -35,7 +35,7 @@ __global__ void llr2CN(int *llr, int *cnbuf, int *l2c_idx)
__syncthreads();
}
__global__ void llr2BN(int *llr, int *const_llr, int *l2b_idx)
__global__ void llr2BN(float *llr, float *const_llr, int *l2b_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
......@@ -43,12 +43,12 @@ __global__ void llr2BN(int *llr, int *const_llr, int *l2b_idx)
__syncthreads();
}
__global__ void CNProcess(int *cnbuf, int *bnbuf, int *b2c_idx, int *cnproc_idx)
__global__ void CNProcess(float *cnbuf, float *bnbuf, int *b2c_idx, int *cnproc_start_idx, int *cnproc_end_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = cnproc_idx[tid*2];
int end = cnproc_idx[tid*2+1];
int start = cnproc_start_idx[tid];
int end = cnproc_end_idx[tid];
int sgn = 1, val = INT32_MAX;
......@@ -68,33 +68,48 @@ __global__ void CNProcess(int *cnbuf, int *bnbuf, int *b2c_idx, int *cnproc_idx)
__syncthreads();
}
__global__ void BNProcess(int *const_llr, int *bnbuf, int *cnbuf, int *c2b_idx, int *bnproc_idx)
__global__ void add(int *bnbuf, int start, int pid, int *buf)
{
__shared__ int sdata[25];
int tid = threadIdx.x;
int num = blockDim.x;
sdata[tid] = bnbuf[start+tid];
for(int s = num/2; s > 0; s>>=1){
if(tid < s){
sdata[tid] += sdata[tid+s];
}
}
if(tid == 0){
buf[pid] = sdata[tid];
}
}
__global__ void BNProcess(float *const_llr, float *bnbuf, float *cnbuf, int *c2b_idx, int *bnproc_start_idx, int *bnproc_end_idx, float *resbuf)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
float val = 0.0;
int start = bnproc_idx[tid*2];
int end = bnproc_idx[tid*2+1];
// int arr[35] = {};
// get_data<<<1, end-start>>>(arr)
int val = 0;
int start = bnproc_start_idx[tid];
int end = bnproc_end_idx[tid];
for(int i = start; i < end; i++){
if(i == tid) continue;
val += bnbuf[i];
}
// cnbuf[c2b_idx[tid]] = resbuf[tid] + const_llr[tid];
cnbuf[c2b_idx[tid]] = val + const_llr[tid];
__syncthreads();
}
__global__ void BN2llr(int *const_llr, int *bnbuf, int *llrbuf, int *llr_idx)
__global__ void BN2llr(float *const_llr, float *bnbuf, float *llrbuf, int *llr_idx)
{
int tid = blockIdx.x*blockDim.x + threadIdx.x;
int start = llr_idx[tid];
int end = llr_idx[tid+1];
int res = 0;
int res = 0.0;
for(int i = start; i < end; i++){
res += bnbuf[i];
}
......@@ -102,7 +117,7 @@ __global__ void BN2llr(int *const_llr, int *bnbuf, int *llrbuf, int *llr_idx)
__syncthreads();
}
__global__ void BitDetermination(int *BN, unsigned int *decode_d)
__global__ void BitDetermination(float *BN, unsigned int *decode_d)
{
__shared__ int tmp[256];
int tid = blockIdx.x*256 + threadIdx.x;
......@@ -127,7 +142,7 @@ __global__ void BitDetermination(int *BN, unsigned int *decode_d)
}
}
void Read_Data(char *filename, int *data_sent, int *data_received)
void Read_Data(char *filename, int *data_sent, float *data_received)
{
FILE *fp = fopen(filename, "r");
fscanf(fp, "%*s");
......@@ -138,7 +153,7 @@ void Read_Data(char *filename, int *data_sent, int *data_received)
fscanf(fp, "%*s");
fscanf(fp, "%*s");
for(int i = 0; i < 26112; i++){
fscanf(fp, "%d", &data_received[i]);
fscanf(fp, "%f", &data_received[i]);
}
fclose(fp);
}
......@@ -146,12 +161,12 @@ void Read_Data(char *filename, int *data_sent, int *data_received)
int main(int argc, char **argv)
{
int *input = (int*)malloc(1056*sizeof(int));
int *llr = (int*)malloc(26112*sizeof(int));
float *llr = (float*)malloc(26112*sizeof(float));
int *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d;
float *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d, *resbuf_d;
unsigned int *decode_output_h, *decode_output_d;
int *l2c_idx_d, *cnproc_idx_d, *c2b_idx_d, *bnproc_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
int *l2c_idx_d, *cnproc_start_idx_d, *cnproc_end_idx_d, *c2b_idx_d, *bnproc_start_idx_d, *bnproc_end_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
char *file = argv[1];
......@@ -169,25 +184,30 @@ int main(int argc, char **argv)
size_t p_llr;
cudaHostAlloc((void**)&decode_output_h, 1056*sizeof(unsigned int), cudaHostAllocMapped);
cudaMallocPitch((void**)&llr_d, &p_llr, 26112*sizeof(int), 1);
cudaMallocPitch((void**)&llrbuf_d, &p_llr, 26112*sizeof(int), 1);
cudaMallocPitch((void**)&const_llr_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&cnbuf_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&bnbuf_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&llr_d, &p_llr, 26112*sizeof(float), 1);
cudaMallocPitch((void**)&llrbuf_d, &p_llr, 26112*sizeof(float), 1);
cudaMallocPitch((void**)&const_llr_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&cnbuf_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&bnbuf_d, &p_llr, 316*384*sizeof(float), 1);
cudaMallocPitch((void**)&l2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&l2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_idx_d, &p_llr, 316*384*2*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&cnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&c2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_idx_d, &p_llr, 316*384*2*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&bnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&b2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&resbuf_d, &p_llr, 316*384*sizeof(int), 1);
cudaMallocPitch((void**)&llr_idx_d, &p_llr, 26113*sizeof(int), 1);
cudaMemcpyAsync((void*)llr_d, (const void*)llr, 68*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)llr_d, (const void*)llr, 68*384*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2c_idx_d, (const void*)l2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)l2b_idx_d, (const void*)l2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_idx_d, (const void*)cnproc_idx, 316*384*2*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_start_idx_d, (const void*)cnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)cnproc_end_idx_d, (const void*)cnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)c2b_idx_d, (const void*)c2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_idx_d, (const void*)bnproc_idx, 316*384*2*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_start_idx_d, (const void*)bnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)bnproc_end_idx_d, (const void*)bnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)b2c_idx_d, (const void*)b2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void*)llr_idx_d, (const void*)llr_idx, 26113*sizeof(int), cudaMemcpyHostToDevice);
......@@ -216,12 +236,12 @@ int main(int argc, char **argv)
char dir[] = "debug/", cn[] = "cnbuf", bn[] = "bnbuf", llrstr[] = "llrbuf_d";
char str[100] = {};
for(int i = 0; i < rounds; i++){
CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_idx_d);
CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_start_idx_d, cnproc_end_idx_d);
#ifdef debug
snprintf(str, 20, "%s%s_%d", dir, bn, i+1);
print_arr(str, bnbuf_d, 316*384);
#endif
BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_idx_d);
BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_start_idx_d, bnproc_end_idx_d, resbuf_d);
#ifdef debug
snprintf(str, 20, "%s%s_%d", dir, cn, i+1);
print_arr(str, cnbuf_d, 316*384);
......@@ -259,12 +279,15 @@ int main(int argc, char **argv)
cudaFree(bnbuf_d);
cudaFree(cnbuf_d);
cudaFree(l2c_idx_d);
cudaFree(cnproc_idx_d);
cudaFree(cnproc_start_idx_d);
cudaFree(cnproc_end_idx_d);
cudaFree(c2b_idx_d);
cudaFree(bnproc_idx_d);
cudaFree(bnproc_start_idx_d);
cudaFree(bnproc_end_idx_d);
cudaFree(b2c_idx_d);
cudaFree(const_llr_d);
cudaFree(llr_idx_d);
cudaFree(resbuf_d);
cudaFreeHost(decode_output_h);
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment