LDPC implementation on GPU

Signed-off-by: NCTU CS ISIP <tyhsu@cs.nctu.edu.tw>

LDPC implementation on GPU
Signed-off-by: NCTU CS ISIP <tyhsu@cs.nctu.edu.tw>
1f6dbb20 · NCTU CS ISIP · fb30fc6a · 1f6dbb20 · 1f6dbb20 · 1f6dbb20
Commit 1f6dbb20 authored Mar 06, 2020 by NCTU CS ISIP
129 changed files
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i0.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i0.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i1.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i1.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i2.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i2.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i3.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i3.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i4.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i4.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i5.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i5.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i6.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i6.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i7.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg1_i7.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i0.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i0.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i1.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i1.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i2.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i2.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i3.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i3.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i4.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i4.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i5.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i5.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i6.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i6.h
--- a/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i7.h
+++ b/openair1/PHY/CODING/GPU_LDPC/BGs/isip_ldpc_bg2_i7.h
--- a/openair1/PHY/CODING/GPU_LDPC/README.md
+++ b/openair1/PHY/CODING/GPU_LDPC/README.md
+### LDPC DECODER 
+ `intuitive_ldpc`: 
+	Intuitive methods of LDPC decoder. This method stores the whole expanded Base Graph, CN, BN buffer as they are. Then, does the cn and bn process according to the equation in `nrLDPC.pdf`.
+
+ `optimized_ldpc`: 
+	This version inherits the idea of current oai ldpc_decoder. It stores lots of LUTs and shrinks the cn, bnbuffer size comparing to the former implementation. Right now, it only supports the longest code block 8448.
+
+### Usage
+ `make ldpc` will compile the program `ldpc`.
+ `make prof [num=<0~100>]` will show the detail of GPU activity.
+ `make test` will build ldpc executable and run `check.sh` to verify the correctness of the implementation.
+
+### Verification
+ The input (channel output) of the LDPC decoder is produced by the `ldpctest` program from OAI.
+ The verification is done by comparing the data decoded with the input data (channel output), and also the data output(`estimated_output`) produced by oai `ldpctest`.
+
+
--- a/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/Makefile
+++ b/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/Makefile
+NVCC = nvcc
+CFLAGS = 
+PROF = nvprof
+exec = ldpc
+file = channel_output.txt 
+OBJS = util.o
+code = 8448
+num = 1
+.PHONY: clean debug
+
+
+## Target-specific Variable Values
+debug: CFLAGS += -Ddebug -g -G
+debug: clean $(exec) 
+	cuda-gdb -q --args $(exec) -l $(code) -f $(file)
+
+$(exec): $(exec).cu $(OBJS)
+	$(NVCC) $(CFLAGS) -o $@ $^
+
+%.o: %.cu
+	cp ../test_input/8448/$(num).txt $(file)
+	$(NVCC) $(CFLAGS) -o $@ -c $<
+
+run: clean $(exec)
+	./$(exec) -l $(code) -f $(file)
+
+prof: clean $(exec)
+	cp ../test_input/8448/$(num).txt $(file)
+	$(PROF) ./$(exec) -l $(code) -f  $(file)
+
+test: clean $(exec)
+	./check.sh $(code)
+	rm $(OBJS) $(file)
+
+clean:
+	rm -rf *.o test.* $(exec) CN* BN* log.txt channel_output.txt
--- a/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/check.sh
+++ b/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/check.sh
+#!/bin/bash
+
+exec=ldpc
+
+help()
+{
+	echo "Usage: $0 <code_length>"
+}
+
+main()
+{
+	if [ -z $@ ]; then
+		help
+		exit 1
+	fi
+
+	dir=$@
+	files=`ls ../test_input/$dir | grep txt`
+#	files="1.txt 2.txt 3.txt"
+	
+	index=1
+	for file in $files
+		do 
+			cp ../test_input/$dir/$file channel_output.txt
+			echo "==== test $index ====" >> log.txt
+			./$exec -l $dir -f channel_output.txt >> log.txt
+			echo ' ' >> log.txt
+			index=$((index+1))
+		done
+}
+
+main $@
--- a/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/ldpc.cu
+++ b/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/ldpc.cu
+#include <stdio.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include "../BGs/isip_ldpc_bg1_i0.h"
+#include "../BGs/isip_ldpc_bg1_i1.h"
+#include "../BGs/isip_ldpc_bg1_i2.h"
+#include "../BGs/isip_ldpc_bg1_i3.h"
+#include "../BGs/isip_ldpc_bg1_i4.h"
+#include "../BGs/isip_ldpc_bg1_i5.h"
+#include "../BGs/isip_ldpc_bg1_i6.h"
+#include "../BGs/isip_ldpc_bg1_i7.h"
+#include "../BGs/isip_ldpc_bg2_i0.h"
+#include "../BGs/isip_ldpc_bg2_i1.h"
+#include "../BGs/isip_ldpc_bg2_i2.h"
+#include "../BGs/isip_ldpc_bg2_i3.h"
+#include "../BGs/isip_ldpc_bg2_i4.h"
+#include "../BGs/isip_ldpc_bg2_i5.h"
+#include "../BGs/isip_ldpc_bg2_i6.h"
+#include "../BGs/isip_ldpc_bg2_i7.h"
+#include "util.h"
+
+#define TNPB 35
+#define BNPG 1024
+#define ITER 5
+
+__constant__ int BG_GPU[46*68];
+
+
+
+__global__ 
+void BNProcess(int flag, int *BN, int *CN, int *CNbuf, const int *const_llr, int BG_col, int BG_row, int Zc)
+{
+	int *CNG = (flag)? CN : CNbuf;
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+
+	for(int col = id; col < BG_col*Zc; col += (TNPB*BNPG))
+	{
+		int tmp = const_llr[col];
+		for(int row = 0; row < BG_row; row++)
+		{
+			int up_shift = (BG_GPU[col/Zc + row*BG_col] - 1)%Zc;
+			if(up_shift != -1)
+			{
+				int row_idx = col%Zc;
+				row_idx = row_idx - up_shift;
+				if(row_idx < 0)	row_idx = Zc + row_idx;
+
+				row_idx = row*Zc + row_idx;
+				tmp = tmp + CNG[row_idx*BG_col*Zc + col];
+			}
+		}
+		BN[col] = tmp;
+	}
+	__syncthreads();
+}
+
+
+
+__global__ void CNProcess(int flag, int *BN, int *CN, int *CNbuf, int BG_col, int BG_row, int Zc)
+{
+	int *CNG	= (flag)? CN : CNbuf;
+	int *SCNG 	= (flag)? CNbuf : CN;
+	int id = blockIdx.x*blockDim.x + threadIdx.x;
+	
+	for(int row = id; row < BG_row*Zc; row += (TNPB*BNPG))
+	{
+		for(int col = 0; col < BG_col; col++)
+		{
+			int right_shift = BG_GPU[(row/Zc)*BG_col + col] -1;
+			if(right_shift != -1)
+			{
+				int row_idx = row;
+//				int col_idx = ((row%384) + right_shift%384) %384 + col*384;
+                int col_idx = (row + right_shift) %Zc + col*Zc;
+				int sgn_cnt = 0, min = INT32_MAX;
+				for(int comp = 0; comp < BG_col; comp++)
+				{
+					if(comp == col)	continue;
+					int comp_right_shift = BG_GPU[(row/Zc)*BG_col + comp] -1;
+					if(comp_right_shift != -1)
+					{
+						int comp_row_idx = row;
+//						int comp_col_idx = ((row%384) + (comp_right_shift%384)) %384 + comp*384;
+                        int comp_col_idx = (row + comp_right_shift) %Zc + comp*Zc;
+						int tmp = BN[comp_col_idx] - CNG[comp_row_idx*BG_col*Zc + comp_col_idx];
+						if(tmp < 0)
+						{
+							tmp = -tmp;
+							sgn_cnt++;
+						}
+						if(tmp < min)	min = tmp;
+					}
+				}
+				SCNG[row_idx*BG_col*Zc + col_idx] = (sgn_cnt%2 == 0)? min: -min;
+			}
+		}
+	}
+	__syncthreads();
+}
+
+__global__ void BitDetermination(int *BN, unsigned int *decode_d)
+{
+	__shared__ int tmp[256];
+	int tid = blockIdx.x*256 + threadIdx.x;
+	int bid = threadIdx.x;
+	tmp[bid] = 0;
+	
+	
+	if(BN[tid] < 0)
+	{
+		tmp[bid] = 1 << (bid&7);
+	}
+
+	__syncthreads();
+	
+	if(threadIdx.x < 32)
+	{
+		decode_d[blockIdx.x*32 + threadIdx.x] = 0;
+		for(int i = 0; i < 8; i++)
+		{
+			decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
+		}
+	}
+}
+
+
+// helper function 
+void printllr(const char *name, int *src, int *des, int count, int type_size)
+{
+	cudaCheck( cudaMemcpy((void *)des, (const void *)src, count*type_size, cudaMemcpyDeviceToHost) );
+
+	FILE *fp = fopen(name, "w");
+	if(!fp)	printf("[error]: open file %s failed\n", name);
+
+	for(int i = 0; i < count; i++){
+		fprintf(fp, "llr[%d]= %d\n", i, des[i]);
+	}
+}
+
+
+int main(int argc, char* argv[])
+{
+	int opt = 0, block_length = 0, BG = 0, Kb = 0, Zc = 0, BG_row = 0, BG_col = 0, lift_index = 0;
+	char file[50] = {};
+	short lift_size[51] = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
+	short lift_set[][9] = {
+		{2,4,8,16,32,64,128,256},
+		{3,6,12,24,48,96,192,384},
+		{5,10,20,40,80,160,320},
+		{7,14,28,56,112,224},
+		{9,18,36,72,144,288},
+		{11,22,44,88,176,352},
+		{13,26,52,104,208},
+		{15,30,60,120,240},
+		{0}};
+
+	while( (opt = getopt(argc, argv, "l:f:")) != -1){
+		switch(opt){
+			case 'l':
+				block_length = atoi(optarg);
+				break;
+			case 'f':
+				strncpy(file, optarg, strlen(optarg));
+				break;
+			default:
+				fprintf(stderr, "Usage: %s [-l code block length] <-f input file>\n", argv[0]);
+				exit(1);
+		}
+	}
+	
+	if(block_length == 0 || file[0] == ' '){
+		fprintf(stderr, "no input file specified or code block length == 0");
+	}
+
+	if(block_length > 3840){
+		BG = 1;
+		Kb = 22;
+		BG_row = 46;
+		BG_col = 68;
+	}else if(block_length <= 3840){
+		BG = 2;
+		BG_row = 42;
+		BG_col = 52;
+		if(block_length > 640)
+			Kb = 10;
+		else if(block_length > 560)
+			Kb = 9;
+		else if(block_length > 192)
+			Kb = 8;
+		else
+			Kb = 6;
+	}
+
+	for(int i = 0; i < 51; i++){
+		if(lift_size[i] >= (double)block_length/Kb){
+			Zc = lift_size[i];
+			break;
+		}
+	}
+	
+	for(int i = 0; lift_set[i][0] != 0; i++){
+		for(int j = 0; lift_set[i][j] != 0; j++){
+			if(Zc == lift_set[i][j]){
+				lift_index = i;
+				break;
+			}
+		}
+	}
+
+	int *BG_CPU = NULL;
+	switch(lift_index){
+		case 0:
+			BG_CPU = (BG == 1)? BG1_I0:BG2_I0;
+			break;
+		case 1:
+			BG_CPU = (BG == 1)? BG1_I1:BG2_I1;
+			break;
+		case 2:
+			BG_CPU = (BG == 1)? BG1_I2:BG2_I2;
+			break;
+		case 3:
+			BG_CPU = (BG == 1)? BG1_I3:BG2_I3;
+			break;
+		case 4:
+			BG_CPU = (BG == 1)? BG1_I4:BG2_I4;
+			break;
+		case 5:
+			BG_CPU = (BG == 1)? BG1_I5:BG2_I5;
+			break;
+		case 6:
+			BG_CPU = (BG == 1)? BG1_I6:BG2_I6;
+			break;
+		case 7:
+			BG_CPU = (BG == 1)? BG1_I7:BG2_I7;
+			break;
+	}
+    
+//    printf("BG %d lift_index %d Zc %d BG_row %d BG_col %d\n", BG, lift_index, Zc, BG_row, BG_col);
+    
+	// alloc cpu memory
+ 	unsigned int *input = (unsigned int*)malloc(sizeof(unsigned int)*8448/8), *decode_output_d, *decode_output_h;
+	int *BN, *CN, *CNbuf, *channel_output, *const_llr;
+//	debug
+//	int *p_BN = (int*)calloc(68*384, sizeof(int));
+//	int *p_CN = (int*)calloc(68*384*46*384, sizeof(int));
+	int *debug_llr = (int*)calloc(68*384, sizeof(int));
+	cudaCheck( cudaHostAlloc((void**)&channel_output, 68*384*sizeof(int), cudaHostAllocDefault) );
+	cudaCheck( cudaHostAlloc((void**)&decode_output_h, (8448/8)*sizeof(unsigned int), cudaHostAllocMapped) );
+	// | cudaHostAllocPortable | cudaHostAllocMapped | cudaHostAllocWriteCombined);
+	
+	// read data from input file
+	ReadDataFromFile(file, input, channel_output, block_length, BG_col, Zc);
+	
+	// alloc gpu memory 
+	// BG
+	cudaCheck( cudaMemcpyToSymbol(BG_GPU, BG_CPU, BG_col*BG_row*sizeof(int)) );
+	// LLR CN BN BUF
+	size_t p_llr;
+	cudaCheck( cudaMallocPitch((void**)&const_llr, &p_llr, 68*384*sizeof(int), 1) );
+	cudaCheck( cudaMallocPitch((void**)&BN, &p_llr, 68*384*sizeof(int), 1) );
+	cudaCheck( cudaMallocPitch((void**)&CN, &p_llr, 68*384*sizeof(int), 46*384) );
+	cudaCheck( cudaMallocPitch((void**)&CNbuf, &p_llr, 68*384*sizeof(int), 46*384) );
+
+	cudaCheck( cudaMemcpyAsync((void*)const_llr, (const void*)channel_output, 68*384*sizeof(int), cudaMemcpyHostToDevice) );
+	cudaCheck( cudaMemcpyAsync((void*)BN, (const void*)channel_output, 68*384*sizeof(int), cudaMemcpyHostToDevice) );
+	cudaCheck( cudaHostGetDevicePointer((void**)&decode_output_d, (void*)decode_output_h, 0) );
+	cudaDeviceSynchronize();
+	
+	cudaEvent_t start, end;
+	float time;
+	
+	cudaEventCreate(&start);
+	cudaEventCreate(&end);
+	
+	cudaEventRecord(start,0);
+
+	dim3 grid(BNPG, 1, 1);
+	dim3 block(TNPB, 1, 1);
+	int flag = 0;
+	char str[20] = {};
+	for(int it = 0; it < ITER; it++){
+		CNProcess<<<grid, block>>>(flag, BN, CN, CNbuf, BG_col, BG_row, Zc);
+		flag = (flag+1)&1;
+		BNProcess<<<grid, block>>>(flag, BN, CN, CNbuf, const_llr, BG_col, BG_row, Zc);
+#ifdef debug 
+		snprintf(str, 20,  "%s_%d", "llr", it);
+		printllr(str, BN, debug_llr, 68*384, sizeof(int));
+#endif
+	}
+	BitDetermination<<<33, 256>>>(BN, decode_output_d);
+	
+	cudaDeviceSynchronize();
+	cudaEventRecord(end,0);
+	cudaEventSynchronize(end);
+	cudaEventElapsedTime(&time, start, end);
+
+	
+	int err_num = 0;
+	for(int i = 0; i < block_length/8; i++){
+		if(input[i] != decode_output_h[i]){
+			printf("input[%d] = %d, decode_output[%d] = %d\n", i, input[i], i, decode_output_h[i]);
+			err_num++;
+		}
+	}
+	printf("err_num == %d\n", err_num);
+	printf("decode time:%f ms\n",time);
+	
+	// free resource 
+	free(input);
+//	free(p_BN);
+//	free(p_CN);
+	cudaFreeHost(channel_output);
+	cudaFreeHost(decode_output_h);
+	cudaFree(const_llr);
+	cudaFree(BN);
+	cudaFree(CN);
+	cudaFree(CNbuf);
+	
+	return 0;
+}
--- a/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/util.cu
+++ b/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/util.cu
+#include <stdio.h>
+#include <cuda_runtime.h>
+#include "util.h"
+
+
+void ReadDataFromFile(const char *file, unsigned int *input_data_arr, int *channel_data_arr, int block_length, int BG_col, int Zc)
+{
+//	static const char testin[] = "../test_input/test_case_1.txt";
+	file_t inputfile;
+	strcpy(inputfile.filename, file);
+	inputfile.fptr = fopen(inputfile.filename, "r");
+	if(inputfile.fptr == NULL)
+	{	
+		puts("cannot open file");
+	}
+	
+	// data processing
+	fgets(inputfile.tmp, 100, inputfile.fptr);	// get rid of gen test
+	for(int i = 0; i < block_length/8; i++)
+	{
+		fscanf(inputfile.fptr, "%d", &input_data_arr[i]);
+	}
+	fgets(inputfile.tmp, 100, inputfile.fptr);	// get rid of '\n'
+	fgets(inputfile.tmp, 100, inputfile.fptr);	// get rid of test end
+	fgets(inputfile.tmp, 100, inputfile.fptr);	// get rid of channel
+	/*
+	for(int i = 0; i < 2*384; i++)
+	{
+		channel_data_arr[i] = 0;
+	}
+	*/
+	for(int i = 0; i < BG_col*Zc; i++)
+	{
+		fscanf(inputfile.fptr, "%d", &channel_data_arr[i]);
+	}
+	fclose(inputfile.fptr);
+}
--- a/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/util.h
+++ b/openair1/PHY/CODING/GPU_LDPC/intuitive_ldpc/util.h
+#ifndef __UTIL_H__
+#define __UTIL_H__
+
+// cuda check macro
+#define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
+inline void cudaAssert(cudaError_t code, const char *file, int line)
+{
+   if (code != cudaSuccess){
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      exit(code);
+   }
+}
+
+// struct
+typedef struct
+{
+	FILE* fptr;
+	char filename[64];
+	char tmp[64];
+}file_t;
+
+// utility
+void ReadDataFromFile(const char*, unsigned int*, int*, int, int, int);
+__global__ void BNProcess(int flag, int *BN, int *CN, int *CNbuf, const int *const_llr);
+__global__ void CNProcess(int flag, int *BN, int *CN, int *CNbuf);
+__global__ void BitDetermination(int *BN, unsigned int *decode_d);
+
+/*
+int32_t nrLDPC_decoder(t_nrLDPC_dec_params* p_decParams, 
+					   int8_t* p_llr, int8_t* p_out, 
+					   t_nrLDPC_procBuf* p_procBuf, 
+					   t_nrLDPC_time_stats* p_profiler){
+
+	return iter;
+}
+
+static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, 
+										   t_nrLDPC_procBuf* p_procBuf, 
+										   uint32_t numLLR, t_nrLDPC_lut* p_lut, 
+										   t_nrLDPC_dec_params* p_decParams, 
+										   t_nrLDPC_time_stats* p_profiler){
+
+	return iter;							
+}
+
+*/
+#endif
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/Makefile
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/Makefile
+EXEC = gen_idx
+TXT = gen_matrix.txt
+DEBUG = debug
+FILE = channel_output.txt
+num = 1
+CFLAGS=
+
+gen: clean gen_idx.c
+	$(CC) $(EXEC).c -o $(EXEC) -g
+
+ldpc: clean ldpc.cu 
+	cp ../test_input/8448/$(num).txt $(FILE)
+	nvcc $(CFLAGS) ldpc.cu -o ldpc
+	mkdir debug
+
+test: ldpc
+	./check.sh
+
+cgdb: CFLAGS += -g -G
+cgdb: clean ldpc
+	cuda-gdb --silent --args ldpc $(FILE)
+
+prof: clean ldpc
+	cp ../test_input/8448/$(num).txt $(FILE)
+	nvprof ./ldpc $(FILE)
+
+.PHONY: clean
+clean:
+	$(RM) -rf $(DEBUG) $(EXEC) $(TXT) ldpc log.txt $(FILE)
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/bg1_i1_index_array.h
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/bg1_i1_index_array.h
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/check.sh
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/check.sh
+set +x
+index=1
+for file in ../test_input/8448/*
+do 
+		cp $file channel_output.txt
+		echo "===== test $index =====" >> log.txt
+		./ldpc channel_output.txt >> log.txt
+		echo ' ' >> log.txt
+		index=$((index+1))
+	done
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/gen_idx.c
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/gen_idx.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef test
+#include "../BGs/isip_ldpc_bg1_i1.h"
+#endif
+int l2c_idx[316*384] = {};		// cnbuf[ tid ] = llr[ l2c_idx[tid] ]
+int l2b_idx[316*384] = {};		// bnbuf[ tid ] = llr[ l2b_idx[tid] ]
+int llr_idx[26113] = {};		// bnbuf2llr, start = llr_idx[tid], end = llr_idx[tid+1], llrbuf[tid] = sum of (bnbuf[start] to bnbuf[end])
+int c2b_idx[316*384] = {};    	// bnbuf[ tid ] = cnbuf[ c2b_idx[tid] ]
+int b2c_idx[316*384] = {};		// cnbuf[ tid ] = bnbuf[ b2c_idx[tid] ]
+int cnproc_start_idx[316*384] = {};	// index for cnproc, int start = cnproc_start_idx[tid]
+int cnproc_end_idx[316*384] = {};	// index for cnproc, int end = cnproc_end_idx[tid]
+int bnproc_start_idx[316*384] = {};	// index for bnproc, int start = bnproc_start_idx[tid]
+int bnproc_end_idx[316*384] = {};	// index for bnproc, int end = bnproc_end_idx[tid]
+
+int *matrix_transpose(int *matrix, int row, int col)
+{
+
+}
+
+void write_to_file(FILE *fp, const char *arr_name, int *arr, int size)
+{
+	fprintf(fp, "%s[%d] = { %d", arr_name, size, arr[0]);
+	for(int i = 1; i < size; i++){
+		fprintf(fp, ", %d", arr[i]);
+	}
+	fprintf(fp, "};\n");
+}
+
+void matrix_expansion(const int *BG, int row, int col, int Zc, int *matrix)
+{
+	for(int i = 0; i < row; i++){
+		for(int j = 0; j < col; j++){
+			int val = BG[i * col + j];
+			if(val != 0){
+				val = (val-1)%Zc;
+				for(int k = i*Zc; k < (i+1)*Zc; k++){
+					int idx = k*(col*Zc) + (j*Zc+val);
+					matrix[idx] = 1;
+					val = (val+1)%Zc;
+				}
+			}
+		}
+	}
+}
+
+void print_matrix(const int *M, int row, int col)
+{
+	printf("Matrix:\n{\n");
+	for(int i = 0; i < row; i++){
+		printf("\t");
+		for(int j = 0; j < col; j++){
+			printf(" %d,", M[i*col + j]);
+		}
+		printf("\n");
+	}
+	printf("}\n");
+}
+
+
+void build_index(const int *BG, int row, int col, int Zc)
+{
+	int *matrix = (int*)malloc(row*col*Zc*Zc*sizeof(int));
+	int cnidx = 0, bnidx = 0, aidx = 0, pidx = 0, lidx1 = 0, lidx2 = 0;
+	
+	matrix_expansion(BG, row, col, Zc, matrix);
+
+	// cn label & l2c_idx
+	for(int i = 0; i < row*Zc; i++){
+		for(int j = 0; j < col*Zc; j++){
+			int k = i*col*Zc + j;
+			if(matrix[k] == 1){
+				matrix[k] = cnidx+1;
+				l2c_idx[cnidx] = j;
+				cnidx++;
+			}
+		}
+	}
+
+	// build c2b_idx & bnproc_idx
+	for(int i = 0; i < col*Zc; i++){
+		int cnt = 0;
+		int start = aidx, end = 0;
+		for(int j = 0; j < row*Zc; j++){
+			int val = matrix[i + j*col*Zc];
+			if(val != 0){
+				c2b_idx[aidx] = val -1;
+				aidx++, cnt++;
+			}
+		}
+		end = start + cnt;
+		while(cnt--){
+			bnproc_start_idx[pidx] = start;
+			bnproc_end_idx[pidx] = end;
+			pidx++;
+		}
+	}
+
+	printf("cnidx %d\naidx %d\npidx %d\n", cnidx, aidx, pidx);
+
+	matrix_expansion(BG, row, col, Zc, matrix);
+
+	// bn label & llr_idx
+	for(int i = 0; i < col*Zc; i++){
+		llr_idx[lidx1++] = bnidx;
+		for(int j = 0; j < row*Zc; j++){
+			int k = i + j*col*Zc;
+			if(matrix[k] == 1){
+				matrix[k] = bnidx+1;
+				l2b_idx[lidx2++] = i;
+				bnidx++;
+			}
+		}
+	}
+	llr_idx[lidx1] = bnidx;
+
+	// build b2c_idx & cnproc_idx
+	aidx = pidx = 0;
+	for(int i = 0; i < row*Zc; i++){
+		int cnt = 0;
+		int start = aidx, end = 0;
+		for(int j = 0; j < col*Zc; j++){
+			int val = matrix[i*col*Zc + j];
+			if(val != 0){
+				b2c_idx[aidx] = val - 1;
+				aidx++, cnt++;
+			}
+		}
+		end = start + cnt;
+		while(cnt--){
+			cnproc_start_idx[pidx] = start;
+			cnproc_end_idx[pidx] = end;
+			pidx++;
+		}
+	}	
+	
+	printf("bnidx %d\naidx %d\npidx %d\n", bnidx, aidx, pidx);
+	printf("lidx1 %d\nlidx2 %d\n", lidx1, lidx2);
+
+	free(matrix);
+}
+
+void generate_header(const char *file, int col, int entry, int Zc)
+{
+#ifdef test
+	FILE *f = fopen("test.h", "w");
+#else
+	FILE *f = fopen(file, "w");
+#endif
+	write_to_file(f, "int l2c_idx", l2c_idx, entry*Zc);
+	write_to_file(f, "int c2b_idx", c2b_idx, entry*Zc);
+	write_to_file(f, "int b2c_idx", b2c_idx, entry*Zc);
+	write_to_file(f, "int cnproc_start_idx", cnproc_start_idx, entry*Zc);
+	write_to_file(f, "int cnproc_end_idx", cnproc_end_idx, entry*Zc);
+	write_to_file(f, "int bnproc_start_idx", bnproc_start_idx, entry*Zc);
+	write_to_file(f, "int bnproc_end_idx", bnproc_end_idx, entry*Zc);
+	write_to_file(f, "int llr_idx", llr_idx, col*Zc+1);
+	write_to_file(f, "int l2b_idx", l2b_idx, entry*Zc);
+
+	fclose(f);
+}
+
+
+int main(int argc, char** argv)
+{
+	int test_BG[4*5] ={ 2, 0, 1, 2, 0,   
+    					0, 3, 0, 2, 2, 
+						1, 0, 3, 3, 0, 
+				   		0, 3, 0, 0, 1};
+	// default
+    int idx = 0, Zc = 384, BG = 1;
+    int max_row = 46, max_col = 68;
+	int entry = 316;
+	char *header_file = "bg1_i1_index_array.h";
+#ifdef test
+	if(argc == 1){
+		printf("\nusage: %s [BG_row] [BG_col] [Zc] [entry] [output_header]\n\n", argv[0]);
+		return -1;
+	}
+#endif
+    if(argc == 6){
+        max_row = atoi(argv[1]);
+        max_col = atoi(argv[2]);
+        Zc      = atoi(argv[3]);
+		entry	= atoi(argv[4]);
+		header_file	= argv[5];
+		printf("max_row %d, max_col %d, Zc %d\n", max_row, max_col, Zc);
+    }
+#ifdef test
+	build_index(test_BG, max_row, max_col, Zc);
+#else
+	build_index(BG1_I1, max_row, max_col, Zc);
+#endif
+	generate_header(header_file, max_col, entry, Zc);
+
+    return 0;
+}
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/ldpc.cu
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/ldpc.cu
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "bg1_i1_index_array.h"
+
+void print_arr_cpu(const char *file, int *arr, int size)
+{
+	FILE *fp = fopen(file, "w");
+	for(int i = 0; i < size; i++){
+		fprintf(fp, "%s[%d]: %d\n", file, i, arr[i]);
+	}
+	fclose(fp);
+}
+
+void print_arr(const char *file, int *arr, int size)
+{
+	int *tmp = (int*)malloc(sizeof(int)*size);
+	FILE *fp = fopen(file, "w");
+	cudaMemcpy((void*)tmp, (const void*)arr, size*sizeof(int), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	for(int i = 0; i < size; i++){
+		fprintf(fp, "%s[%d]: %d\n", file, i, tmp[i]);
+	}
+	free(tmp);
+	fclose(fp);
+}
+
+__global__ void llr2CN(float *llr, float *cnbuf, int *l2c_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	cnbuf[tid] = llr[l2c_idx[tid]];
+	__syncthreads();
+}
+
+__global__ void llr2BN(float *llr, float *const_llr, int *l2b_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	const_llr[tid] = llr[l2b_idx[tid]];
+	__syncthreads();
+}
+
+__global__ void CNProcess(float *cnbuf, float *bnbuf, int *b2c_idx, int *cnproc_start_idx, int *cnproc_end_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	int start = cnproc_start_idx[tid];
+	int end = cnproc_end_idx[tid];
+	
+	
+	int sgn = 1, val = INT32_MAX;
+	for(int i = start; i < end; i++){
+		if(i == tid)	continue;
+
+		int tmp = cnbuf[i];
+		if(tmp < 0){
+			tmp = -tmp;
+			sgn = -sgn;
+		}
+		if(val > tmp){
+			val = tmp;
+		}
+	}
+	bnbuf[b2c_idx[tid]] = sgn*val;// + const_llr[tid];
+	__syncthreads();
+}
+
+__global__ void add(int *bnbuf, int start, int pid, int *buf)
+{
+	__shared__ int sdata[25];
+	int tid = threadIdx.x;
+	int num = blockDim.x;
+	sdata[tid] = bnbuf[start+tid];
+	for(int s = num/2; s > 0; s>>=1){
+		if(tid < s){
+			sdata[tid] += sdata[tid+s];
+		}
+	}
+	if(tid == 0){
+		buf[pid] = sdata[tid];
+	}
+}
+
+__global__ void BNProcess(float *const_llr, float *bnbuf, float *cnbuf, int *c2b_idx, int *bnproc_start_idx, int *bnproc_end_idx, float *resbuf)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+	float val = 0.0;
+	
+	int start = bnproc_start_idx[tid];
+	int end = bnproc_end_idx[tid];
+	for(int i = start; i < end; i++){
+		if(i == tid)	continue;
+		val += bnbuf[i];
+	}
+	
+//	cnbuf[c2b_idx[tid]] = resbuf[tid] + const_llr[tid];
+	cnbuf[c2b_idx[tid]] = val + const_llr[tid];
+	__syncthreads();
+}
+
+
+__global__ void BN2llr(float *const_llr, float *bnbuf, float *llrbuf, int *llr_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	int start = llr_idx[tid];
+	int end = llr_idx[tid+1];
+
+	int res = 0.0;
+	for(int i = start; i < end; i++){
+		res += bnbuf[i];
+	}
+	llrbuf[tid] = res + const_llr[tid];
+	__syncthreads();
+}
+
+__global__ void BitDetermination(float *BN, unsigned int *decode_d)
+{
+	__shared__ int tmp[256];
+	int tid = blockIdx.x*256 + threadIdx.x;
+	int bid = threadIdx.x;
+	tmp[bid] = 0;
+	
+	
+	if(BN[tid] < 0)
+	{
+		tmp[bid] = 1 << (bid&7);
+	}
+
+	__syncthreads();
+	
+	if(threadIdx.x < 32)
+	{
+		decode_d[blockIdx.x*32 + threadIdx.x] = 0;
+		for(int i = 0; i < 8; i++)
+		{
+			decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
+		}
+	}
+}
+
+void Read_Data(char *filename, int *data_sent, float *data_received)
+{
+	FILE *fp = fopen(filename, "r");
+	fscanf(fp, "%*s");
+	for(int i = 0; i < 1056; i++){
+		fscanf(fp, "%d", &data_sent[i]);
+	}
+	fscanf(fp, "%*s");
+	fscanf(fp, "%*s");
+	fscanf(fp, "%*s");
+	for(int i = 0; i < 26112; i++){
+		fscanf(fp, "%f", &data_received[i]);
+	}
+	fclose(fp);
+}
+
+int main(int argc, char **argv)
+{
+	int code_length = 8448, BG = 1;
+	int *input = (int*)malloc(1056*sizeof(int));
+	float *llr = (float*)malloc(26112*sizeof(float));
+
+	float *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d, *resbuf_d;
+	unsigned int *decode_output_h, *decode_output_d;
+
+	int *l2c_idx_d, *cnproc_start_idx_d, *cnproc_end_idx_d, *c2b_idx_d, *bnproc_start_idx_d, *bnproc_end_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
+
+	char *file = argv[1];
+	
+	
+	int blockNum = 237, threadNum = 512;
+	//int blockNum = 33, threadNum = 256;
+	//int blockNum = 17, threadNum = 512;
+
+	int rounds = 5, Zc = 384;
+
+	Read_Data(file, input, llr);
+
+
+
+	size_t p_llr;
+	cudaHostAlloc((void**)&decode_output_h, 1056*sizeof(unsigned int), cudaHostAllocMapped);
+
+	cudaMallocPitch((void**)&llr_d, &p_llr, 26112*sizeof(float), 1);
+	cudaMallocPitch((void**)&llrbuf_d, &p_llr, 26112*sizeof(float), 1);
+	cudaMallocPitch((void**)&const_llr_d, &p_llr, 316*384*sizeof(float), 1);
+	cudaMallocPitch((void**)&cnbuf_d, &p_llr, 316*384*sizeof(float), 1);
+	cudaMallocPitch((void**)&bnbuf_d, &p_llr, 316*384*sizeof(float), 1);
+	cudaMallocPitch((void**)&l2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&l2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&cnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&cnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&c2b_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&bnproc_start_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&bnproc_end_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&b2c_idx_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&resbuf_d, &p_llr, 316*384*sizeof(int), 1);
+	cudaMallocPitch((void**)&llr_idx_d, &p_llr, 26113*sizeof(int), 1);
+
+	cudaMemcpyAsync((void*)llr_d, (const void*)llr, 68*384*sizeof(float), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)l2c_idx_d, (const void*)l2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)l2b_idx_d, (const void*)l2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)cnproc_start_idx_d, (const void*)cnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)cnproc_end_idx_d, (const void*)cnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)c2b_idx_d, (const void*)c2b_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)bnproc_start_idx_d, (const void*)bnproc_start_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)bnproc_end_idx_d, (const void*)bnproc_end_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)b2c_idx_d, (const void*)b2c_idx, 316*384*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)llr_idx_d, (const void*)llr_idx, 26113*sizeof(int), cudaMemcpyHostToDevice);	
+
+	cudaHostGetDevicePointer((void**)&decode_output_d, (void*)decode_output_h, 0);
+	cudaDeviceSynchronize();
+
+
+	printf("BG %d, Zc %d, code_length %d\n", BG, Zc, code_length);
+
+
+	cudaEvent_t start, end;
+	float time;
+
+	cudaEventCreate(&start);
+	cudaEventCreate(&end);
+	cudaEventRecord(start, 0);
+
+
+	llr2CN<<<blockNum, threadNum>>>(llr_d, cnbuf_d, l2c_idx_d);
+	llr2BN<<<blockNum, threadNum>>>(llr_d, const_llr_d, l2b_idx_d);
+
+/*
+	print_arr("debug/const_llr_d", const_llr_d, 26112);
+	print_arr("debug/cnbuf_d", cnbuf_d, 316*384);
+	print_arr("debug/const_llrbuf_d", const_llrbuf_d, 316*384);
+*/
+
+	char dir[] = "debug/", cn[] = "cnbuf", bn[] = "bnbuf", llrstr[] = "llrbuf_d";
+	char str[100] = {};
+	for(int i = 0; i < rounds; i++){
+		CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_start_idx_d, cnproc_end_idx_d);
+#ifdef debug
+		snprintf(str, 20, "%s%s_%d", dir, bn, i+1);
+		print_arr(str, bnbuf_d, 316*384);
+#endif
+		BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_start_idx_d, bnproc_end_idx_d, resbuf_d);
+#ifdef debug
+		snprintf(str, 20, "%s%s_%d", dir, cn, i+1);
+		print_arr(str, cnbuf_d, 316*384);
+#endif
+		BN2llr<<<51, 512>>>(llr_d, bnbuf_d, llrbuf_d, llr_idx_d);
+#ifdef debug
+		snprintf(str, 20, "%s%s_%d", dir, llrstr, i+1);
+		print_arr(str, llrbuf_d, 26112);
+#endif
+	}
+
+	BitDetermination<<<33, 256>>>(llrbuf_d, decode_output_d);
+	cudaDeviceSynchronize();
+
+
+	cudaEventRecord(end, 0);
+	cudaEventSynchronize(end);
+	cudaEventElapsedTime(&time, start, end);
+	printf("time: %.6f ms\n", time);
+
+
+	int err = 0;
+	for(int i = 0; i < 8448/8; i++){
+		if(input[i] != decode_output_h[i]){
+//			printf("input[%d] :%d, decode_output[%d]: %d\n", i, input[i], i, decode_output_h[i]);
+			err++;
+		}
+	}
+	printf("err: %d\n", err);
+
+	free(input);
+	free(llr);
+	cudaFree(llr_d);
+	cudaFree(llrbuf_d);
+	cudaFree(bnbuf_d);
+	cudaFree(cnbuf_d);
+	cudaFree(l2c_idx_d);
+	cudaFree(cnproc_start_idx_d);
+	cudaFree(cnproc_end_idx_d);
+	cudaFree(c2b_idx_d);
+	cudaFree(bnproc_start_idx_d);
+	cudaFree(bnproc_end_idx_d);
+	cudaFree(b2c_idx_d);
+	cudaFree(const_llr_d);
+	cudaFree(llr_idx_d);
+	cudaFree(resbuf_d);
+
+	cudaFreeHost(decode_output_h);
+	return 0;
+}
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/test/test.cu
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/test/test.cu
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include "test.h"
+
+void print_arr_cpu(const char *file, int *arr, int size)
+{
+	FILE *fp = fopen(file, "w");
+	for(int i = 0; i < size; i++){
+		fprintf(fp, "%s[%d]: %d\n", file, i, arr[i]);
+	}
+	fclose(fp);
+}
+
+void print_arr(const char *file, int *arr, int size)
+{
+	int *tmp = (int*)malloc(sizeof(int)*size);
+	FILE *fp = fopen(file, "w");
+	cudaMemcpy((void*)tmp, (const void*)arr, size*sizeof(int), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	for(int i = 0; i < size; i++){
+		fprintf(fp, "%s[%d]: %d\n", file, i, tmp[i]);
+	}
+	free(tmp);
+	fclose(fp);
+}
+
+__global__ void llr2CN(int *llr, int *cnbuf, int *l2c_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	cnbuf[tid] = llr[l2c_idx[tid]];
+	__syncthreads();
+}
+
+__global__ void llr2BN(int *llr, int *const_llr, int *l2b_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	const_llr[tid] = llr[l2b_idx[tid]];
+	__syncthreads();
+}
+
+__global__ void CNProcess(int *cnbuf, int *bnbuf, int *b2c_idx, int *cnproc_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	int start = cnproc_idx[tid*2];
+	int end = cnproc_idx[tid*2+1];
+/*	
+	if (tid == 6){
+		printf("start %d, end %d\n", start, end);
+	}
+*/
+	int sgn = 1, val = INT32_MAX;
+	for(int i = start; i < end; i++){
+		if(i == tid)	continue;
+		
+		int tmp = cnbuf[i];
+//		if(tid == 6)	printf("tmp %d\n", tmp);
+		if(tmp < 0){
+			tmp = -tmp;
+			sgn = -sgn;
+		}
+		if(val > tmp){
+			val = tmp;
+//			if(tid == 6)printf("val = tmp\n");
+		}
+//		if(tid == 6)	printf("%d\n", val);
+	}
+/*	
+	if(tid == 6){
+		printf("b2c_idx[6] %d\n", b2c_idx[tid]);
+		printf("sgn*val = %d\n", sgn*val);
+		printf("\n\n");
+	}
+*/
+	bnbuf[b2c_idx[tid]] = sgn*val;// + const_llr[tid];
+	__syncthreads();
+}
+
+__global__ void BNProcess(int *const_llr, int *bnbuf, int *cnbuf, int *c2b_idx, int *bnproc_idx)
+{ 
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+	
+	int start = bnproc_idx[tid*2];
+	int end = bnproc_idx[tid*2+1];
+	
+	int val = 0;
+	for(int i = start; i < end; i++){
+		if(i == tid)	continue;
+		val += bnbuf[i];
+	}
+	cnbuf[c2b_idx[tid]] = val + const_llr[tid];
+	__syncthreads();
+}
+
+__global__ void BN2llr(int *bnbuf, int *llrbuf, int *llr_idx)
+{
+	int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	int start = llr_idx[tid];
+	int end = llr_idx[tid+1];
+
+	int res = 0;
+	for(int i = start; i < end; i++){
+		res += bnbuf[i];
+	}
+	llrbuf[tid] = res;
+	__syncthreads();
+}
+
+__global__ void BitDetermination(int *BN, unsigned int *decode_d)
+{
+	__shared__ int tmp[256];
+	int tid = blockIdx.x*256 + threadIdx.x;
+	int bid = threadIdx.x;
+	tmp[bid] = 0;
+	
+	
+	if(BN[tid] < 0)
+	{
+		tmp[bid] = 1 << (bid&7);
+	}
+
+	__syncthreads();
+	
+	if(threadIdx.x < 32)
+	{
+		decode_d[blockIdx.x*32 + threadIdx.x] = 0;
+		for(int i = 0; i < 8; i++)
+		{
+			decode_d[blockIdx.x*32 + threadIdx.x] += tmp[threadIdx.x*8+i];
+		}
+	}
+}
+
+
+int main(int argc, char **argv)
+{
+	int llr[15] = {-48, 27, -47, 13, 34, 
+				   -41, 51, 29, -6, -19,
+				   -45, -42, -40, -6, -33};
+	
+	int *llr_d, *llrbuf_d, *const_llr_d, *cnbuf_d, *bnbuf_d;
+	int *l2c_idx_d, *cnproc_idx_d, *c2b_idx_d, *bnproc_idx_d, *b2c_idx_d, *llr_idx_d, *l2b_idx_d;
+
+	int blockNum = 6, threadNum = 11;
+	int rounds = 5, Zc = 3;
+
+	size_t p_llr;
+
+	cudaMallocPitch((void**)&llr_d, &p_llr, 15*sizeof(int), 1);
+	cudaMallocPitch((void**)&llrbuf_d, &p_llr, 15*sizeof(int), 1);
+	cudaMallocPitch((void**)&const_llr_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&cnbuf_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&bnbuf_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&l2c_idx_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&l2b_idx_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&cnproc_idx_d, &p_llr, 33*2*sizeof(int), 1);
+	cudaMallocPitch((void**)&c2b_idx_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&bnproc_idx_d, &p_llr, 33*2*sizeof(int), 1);
+	cudaMallocPitch((void**)&b2c_idx_d, &p_llr, 33*sizeof(int), 1);
+	cudaMallocPitch((void**)&llr_idx_d, &p_llr, 16*sizeof(int), 1);
+
+	cudaMemcpyAsync((void*)llr_d, (const void*)llr, 15*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)l2c_idx_d, (const void*)l2c_idx, 33*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)l2b_idx_d, (const void*)l2b_idx, 33*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)cnproc_idx_d, (const void*)cnproc_idx, 33*2*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)c2b_idx_d, (const void*)c2b_idx, 33*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)bnproc_idx_d, (const void*)bnproc_idx, 33*2*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)b2c_idx_d, (const void*)b2c_idx, 33*sizeof(int), cudaMemcpyHostToDevice);	
+	cudaMemcpyAsync((void*)llr_idx_d, (const void*)llr_idx, 16*sizeof(int), cudaMemcpyHostToDevice);	
+
+	cudaDeviceSynchronize();
+
+
+
+	llr2CN<<<blockNum, threadNum>>>(llr_d, cnbuf_d, l2c_idx_d);
+	llr2CN<<<blockNum, threadNum>>>(llr_d, const_llr_d, l2b_idx_d);
+
+/*
+	print_arr("debug/const_llr_d", const_llr_d, 26112);
+	print_arr("debug/cnbuf_d", cnbuf_d, 316*384);
+	print_arr("debug/const_llrbuf_d", const_llrbuf_d, 316*384);
+*/
+
+	char debug[] = "debug/";
+	char cn[] = "cnbuf";
+	char bn[] = "bnbuf";
+	char llrstr[] = "llrbuf_d";
+	char str[100] = {};
+	for(int i = 0; i < rounds; i++){
+		CNProcess<<<blockNum, threadNum>>>(cnbuf_d, bnbuf_d, b2c_idx_d, cnproc_idx_d);
+		snprintf(str, 20, "%s%s_%d", debug, bn, i+1);
+		cudaDeviceSynchronize();
+		print_arr(str, bnbuf_d, 33);
+
+		BNProcess<<<blockNum, threadNum>>>(const_llr_d, bnbuf_d, cnbuf_d, c2b_idx_d, bnproc_idx_d);
+		snprintf(str, 20, "%s%s_%d", debug, cn, i+1);
+		print_arr(str, cnbuf_d, 33);
+
+		BN2llr<<<3,5>>>(bnbuf_d, llrbuf_d, llr_idx_d);
+		snprintf(str, 20, "%s%s_%d", debug, llrstr, i+1);
+		print_arr(str, llrbuf_d, 15);
+	}
+
+//	BitDetermination<<<33, 256>>>(llrbuf_d, decode_output_d);
+	cudaDeviceSynchronize();
+
+
+	cudaFree(llrbuf_d);
+	cudaFree(bnbuf_d);
+	cudaFree(cnbuf_d);
+	cudaFree(l2c_idx_d);
+	cudaFree(l2b_idx_d);
+	cudaFree(cnproc_idx_d);
+	cudaFree(c2b_idx_d);
+	cudaFree(bnproc_idx_d);
+	cudaFree(b2c_idx_d);
+	cudaFree(llr_d);
+	cudaFree(const_llr_d);
+	cudaFree(llr_idx_d);
+
+	return 0;
+}
--- a/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/test/test.h
+++ b/openair1/PHY/CODING/GPU_LDPC/optimized_ldpc/test/test.h
+int l2c_idx[33] = { 1, 6, 10, 2, 7, 11, 0, 8, 9, 5, 10, 13, 3, 11, 14, 4, 9, 12, 0, 8, 11, 1, 6, 9, 2, 7, 10, 5, 12, 3, 13, 4, 14};
+int c2b_idx[33] = { 6, 18, 0, 21, 3, 24, 12, 29, 15, 31, 9, 27, 1, 22, 4, 25, 7, 19, 8, 16, 23, 2, 10, 26, 5, 13, 20, 17, 28, 11, 30, 14, 32};
+int b2c_idx[33] = { 2, 12, 21, 4, 14, 24, 0, 16, 18, 10, 22, 29, 6, 25, 31, 8, 19, 27, 1, 17, 26, 3, 13, 20, 5, 15, 23, 11, 28, 7, 30, 9, 32};
+int cnproc_idx[66] = { 0, 3, 0, 3, 0, 3, 3, 6, 3, 6, 3, 6, 6, 9, 6, 9, 6, 9, 9, 12, 9, 12, 9, 12, 12, 15, 12, 15, 12, 15, 15, 18, 15, 18, 15, 18, 18, 21, 18, 21, 18, 21, 21, 24, 21, 24, 21, 24, 24, 27, 24, 27, 24, 27, 27, 29, 27, 29, 29, 31, 29, 31, 31, 33, 31, 33};
+int bnproc_idx[66] = { 0, 2, 0, 2, 2, 4, 2, 4, 4, 6, 4, 6, 6, 8, 6, 8, 8, 10, 8, 10, 10, 12, 10, 12, 12, 14, 12, 14, 14, 16, 14, 16, 16, 18, 16, 18, 18, 21, 18, 21, 18, 21, 21, 24, 21, 24, 21, 24, 24, 27, 24, 27, 24, 27, 27, 29, 27, 29, 29, 31, 29, 31, 31, 33, 31, 33};
+int llr_idx[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 21, 24, 27, 29, 31, 33};
+int l2b_idx[33] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14};
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/1.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/1.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/10.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/10.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/100.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/100.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/11.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/11.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/12.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/12.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/13.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/13.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/14.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/14.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/15.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/15.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/16.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/16.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/17.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/17.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/18.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/18.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/19.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/19.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/2.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/2.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/20.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/20.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/21.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/21.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/22.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/22.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/23.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/23.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/24.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/24.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/25.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/25.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/26.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/26.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/27.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/27.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/28.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/28.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/29.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/29.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/3.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/3.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/30.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/30.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/31.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/31.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/32.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/32.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/33.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/33.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/34.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/34.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/35.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/35.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/36.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/36.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/37.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/37.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/38.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/38.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/39.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/39.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/4.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/4.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/40.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/40.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/41.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/41.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/42.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/42.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/43.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/43.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/44.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/44.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/45.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/45.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/46.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/46.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/47.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/47.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/48.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/48.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/49.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/49.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/5.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/5.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/50.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/50.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/51.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/51.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/52.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/52.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/53.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/53.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/54.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/54.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/55.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/55.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/56.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/56.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/57.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/57.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/58.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/58.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/59.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/59.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/6.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/6.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/60.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/60.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/61.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/61.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/62.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/62.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/63.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/63.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/64.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/64.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/65.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/65.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/66.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/66.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/67.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/67.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/68.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/68.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/69.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/69.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/7.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/7.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/70.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/70.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/71.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/71.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/72.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/72.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/73.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/73.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/74.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/74.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/75.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/75.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/76.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/76.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/77.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/77.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/78.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/78.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/79.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/79.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/8.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/8.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/80.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/80.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/81.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/81.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/82.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/82.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/83.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/83.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/84.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/84.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/85.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/85.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/86.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/86.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/87.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/87.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/88.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/88.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/89.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/89.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/9.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/9.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/90.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/90.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/91.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/91.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/92.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/92.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/93.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/93.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/94.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/94.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/95.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/95.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/96.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/96.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/97.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/97.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/98.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/98.txt
--- a/openair1/PHY/CODING/GPU_LDPC/test_input/8448/99.txt
+++ b/openair1/PHY/CODING/GPU_LDPC/test_input/8448/99.txt