/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.1  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/*!\file ldpc_encoder2.c
 * \brief Defines the optimized LDPC encoder
 * \author Florian Kaltenberger, Raymond Knopp, Kien le Trung (Eurecom)
 * \email openair_tech@eurecom.fr
 * \date 27-03-2018
 * \version 1.0
 * \note
 * \warning
 */

#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <types.h>
#include "assertions.h"
#include "PHY/TOOLS/time_meas.h"
#include "defs.h"

//#define DEBUG_LDPC

#include "ldpc384_byte.c"
#include "ldpc352_byte.c"
#include "ldpc320_byte.c"
#include "ldpc288_byte.c"
#include "ldpc256_byte.c"
#include "ldpc240_byte.c"
#include "ldpc224_byte.c"
#include "ldpc208_byte.c"
#include "ldpc192_byte.c"
#include "ldpc176_byte.c"
#include "ldpc_BG2_Zc384_byte.c"
#include "ldpc_BG2_Zc352_byte.c"
#include "ldpc_BG2_Zc320_byte.c"
#include "ldpc_BG2_Zc288_byte.c"
#include "ldpc_BG2_Zc256_byte.c"
#include "ldpc_BG2_Zc240_byte.c"
#include "ldpc_BG2_Zc224_byte.c"
#include "ldpc_BG2_Zc208_byte.c"
#include "ldpc_BG2_Zc192_byte.c"
#include "ldpc_BG2_Zc176_byte.c"
#include "ldpc_BG2_Zc160_byte.c"
#include "ldpc_BG2_Zc144_byte.c"
#include "ldpc_BG2_Zc128_byte.c"
#include "ldpc_BG2_Zc120_byte.c"
#include "ldpc_BG2_Zc112_byte.c"
#include "ldpc_BG2_Zc104_byte.c"
#include "ldpc_BG2_Zc96_byte.c"
#include "ldpc_BG2_Zc88_byte.c"
#include "ldpc_BG2_Zc80_byte.c"
#include "ldpc_BG2_Zc72_byte.c"



void encode_parity_check_part_optim(uint8_t *c,uint8_t *d, short BG,short Zc,short Kb)
{

  if (BG==1)
  {
    switch (Zc)
    {
    case 2: break;
    case 3: break;
    case 4: break;
    case 5: break;
    case 6: break;
    case 7: break;
    case 8: break;
    case 9: break;
    case 10: break;
    case 11: break;
    case 12: break;
    case 13: break;
    case 14: break;
    case 15: break;
    case 16: break;
    case 18: break;
    case 20: break;
    case 22: break;
    case 24: break;      
    case 26: break;
    case 28: break;
    case 30: break;
    case 32: break;
    case 36: break;
    case 40: break;
    case 44: break;
    case 48: break;
    case 52: break;
    case 56: break;
    case 60: break;
    case 64: break;
    case 72: break;
    case 80: break;   
    case 88: break;   
    case 96: break;
    case 104: break;
    case 112: break;
    case 120: break;
    case 128: break;
    case 144: break;
    case 160: break;
    case 176: ldpc176_byte(c,d); break;
    case 192: ldpc192_byte(c,d); break;
    case 208: ldpc208_byte(c,d); break;
    case 224: ldpc224_byte(c,d); break;
    case 240: ldpc240_byte(c,d); break;
    case 256: ldpc256_byte(c,d); break;
    case 288: ldpc288_byte(c,d); break;
    case 320: ldpc320_byte(c,d); break;
    case 352: ldpc352_byte(c,d); break;
    case 384: ldpc384_byte(c,d); break;
    default: AssertFatal(0,"BG %d Zc %d is not supported yet\n",BG,Zc); break;
    }
  }
  else if (BG==2) {
    switch (Zc)
    {
    case 2: break;
    case 3: break;
    case 4: break;
    case 5: break;
    case 6: break;
    case 7: break;
    case 8: break;
    case 9: break;
    case 10: break;
    case 11: break;
    case 12: break;
    case 13: break;
    case 14: break;
    case 15: break;
    case 16: break;
    case 18: break;
    case 20: break;
    case 22: break;
    case 24: break;      
    case 26: break;
    case 28: break;
    case 30: break;
    case 32: break;
    case 36: break;
    case 40: break;
    case 44: break;
    case 48: break;
    case 52: break;
    case 56: break;
    case 60: break;
    case 64: break;
    case 72: ldpc_BG2_Zc72_byte(c,d); break;
    case 80: ldpc_BG2_Zc80_byte(c,d); break;   
    case 88: ldpc_BG2_Zc88_byte(c,d); break;   
    case 96: ldpc_BG2_Zc96_byte(c,d); break;
    case 104: ldpc_BG2_Zc104_byte(c,d); break;
    case 112: ldpc_BG2_Zc112_byte(c,d); break;
    case 120: ldpc_BG2_Zc120_byte(c,d); break;
    case 128: ldpc_BG2_Zc128_byte(c,d); break;
    case 144: ldpc_BG2_Zc144_byte(c,d); break;
    case 160: ldpc_BG2_Zc160_byte(c,d); break;
    case 176: ldpc_BG2_Zc176_byte(c,d); break;
    case 192: ldpc_BG2_Zc192_byte(c,d); break;
    case 208: ldpc_BG2_Zc208_byte(c,d); break;
    case 224: ldpc_BG2_Zc224_byte(c,d); break;
    case 240: ldpc_BG2_Zc240_byte(c,d); break;
    case 256: ldpc_BG2_Zc256_byte(c,d); break;
    case 288: ldpc_BG2_Zc288_byte(c,d); break;
    case 320: ldpc_BG2_Zc320_byte(c,d); break;
    case 352: ldpc_BG2_Zc352_byte(c,d); break;
    case 384: ldpc_BG2_Zc384_byte(c,d); break;
    default: AssertFatal(0,"BG %d Zc %d is not supported yet\n",BG,Zc); break;
    }
  }
  else {
    AssertFatal(0,"BG %d is not supported yet\n",BG);
  } 

}


int ldpc_encoder_optim(unsigned char *test_input,unsigned char *channel_input,short block_length,short BG,time_stats_t *tinput,time_stats_t *tprep,time_stats_t *tparity,time_stats_t *toutput)
{

  short Zc,Kb=0,nrows=0,ncols=0;
  int i,i1;
  int no_punctured_columns,removed_bit;

  //Table of possible lifting sizes
  short lift_size[51]= {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};

  int simd_size;

  //determine number of bits in codeword
   //if (block_length>3840)
   if (BG==1)
     {
       //BG=1;
       Kb = 22;
       nrows=46; //parity check bits
       ncols=22; //info bits
     }
     //else if (block_length<=3840)
    else if	(BG==2)
     {
       //BG=2;
       nrows=42; //parity check bits
       ncols=10; // info bits

       if (block_length>640)
 	Kb = 10;
       else if (block_length>560)
 	Kb = 9;
       else if (block_length>192)
       Kb = 8;
     else
       Kb = 6;
       }

  //find minimum value in all sets of lifting size
  Zc=0;
  for (i1=0; i1 < 51; i1++)
  {
    if (lift_size[i1] >= (double) block_length/Kb)
    {
      Zc = lift_size[i1];
      //printf("%d\n",Zc);
      break;
    }
  }
  AssertFatal(Zc>0,"no valid Zc found for block length %d\n",block_length);

#ifdef DEBUG_LDPC
  LOG_D(PHY,"ldpc_encoder_optim_8seg: BG %d, Zc %d, Kb %d, block_length %d\n",BG,Zc,Kb,block_length);
  LOG_D(PHY,"ldpc_encoder_optim_8seg: PDU %x %x %x %x\n",test_input[0],test_input[1],test_input[2],test_input[3]);
#endif

  if ((Zc&31) > 0) simd_size = 16;
  else simd_size = 32;

  unsigned char c[22*Zc] __attribute__((aligned(32))); //padded input, unpacked, max size
  unsigned char d[46*Zc] __attribute__((aligned(32))); //coded parity part output, unpacked, max size

  unsigned char c_extension[2*22*Zc*simd_size] __attribute__((aligned(32)));      //double size matrix of c

  // calculate number of punctured bits
  no_punctured_columns=(int)((nrows-2)*Zc+block_length-block_length*3)/Zc;
  removed_bit=(nrows-no_punctured_columns-2) * Zc+block_length-(int)(block_length*3);
  // printf("%d\n",no_punctured_columns);
  // printf("%d\n",removed_bit);
  // unpack input
  memset(c,0,sizeof(unsigned char) * ncols * Zc);
  memset(d,0,sizeof(unsigned char) * nrows * Zc);

  if(tinput != NULL) start_meas(tinput);
  for (i=0; i<block_length; i++) {
      c[i] = (test_input[i/8]&(1<<(i&7)))>>(i&7);
      //printf("c(%d,%d)=%d\n",j,i,temp);
    }

  if(tinput != NULL) stop_meas(tinput);

  if ((BG==1 && Zc>176) || (BG==2 && Zc>64)) { 
    // extend matrix
    if(tprep != NULL) start_meas(tprep);
    for (i1=0; i1 < ncols; i1++)
      {
	memcpy(&c_extension[2*i1*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
	memcpy(&c_extension[(2*i1+1)*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
      }
    for (i1=1;i1<simd_size;i1++) {
      memcpy(&c_extension[(2*ncols*Zc*i1)], &c_extension[i1], (2*ncols*Zc*sizeof(unsigned char))-i1);
      //    memset(&c_extension[(2*ncols*Zc*i1)],0,i1);
      /*
	printf("shift %d: ",i1);
	for (int j=0;j<64;j++) printf("%d ",c_extension[(2*ncols*Zc*i1)+j]);
	printf("\n");
      */
    }
    if(tprep != NULL) stop_meas(tprep);
    //parity check part
    if(tparity != NULL) start_meas(tparity);
    encode_parity_check_part_optim(c_extension, d, BG, Zc, Kb);
    if(tparity != NULL) stop_meas(tparity);
  }
  else {
    if (encode_parity_check_part_orig(c, d, BG, Zc, Kb, block_length)!=0) {
      printf("Problem with encoder\n");
      return(-1);
    }
  }
  if(toutput != NULL) start_meas(toutput);
  // information part and puncture columns
  memcpy(&channel_input[0], &c[2*Zc], (block_length-2*Zc)*sizeof(unsigned char));
  memcpy(&channel_input[block_length-2*Zc], &d[0], ((nrows-no_punctured_columns) * Zc-removed_bit)*sizeof(unsigned char));

  if(toutput != NULL) stop_meas(toutput);
  return 0;
}

int ldpc_encoder_optim_8seg(unsigned char **test_input,unsigned char **channel_input,short block_length,short BG,int n_segments,time_stats_t *tinput,time_stats_t *tprep,time_stats_t *tparity,time_stats_t *toutput)
{

  short Zc,Kb=0,nrows=0,ncols=0;
  int i,i1,j;
  int no_punctured_columns,removed_bit;
  //Table of possible lifting sizes
  short lift_size[51]= {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
  char temp;
  int simd_size;

#ifdef __AVX2__
  __m256i shufmask = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,0x0101010101010101, 0x0000000000000000);
  __m256i andmask  = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
  __m256i zero256   = _mm256_setzero_si256();
  __m256i masks[8];
  register __m256i c256;
  masks[0] = _mm256_set1_epi8(0x1);
  masks[1] = _mm256_set1_epi8(0x2);
  masks[2] = _mm256_set1_epi8(0x4);
  masks[3] = _mm256_set1_epi8(0x8);
  masks[4] = _mm256_set1_epi8(0x10);
  masks[5] = _mm256_set1_epi8(0x20);
  masks[6] = _mm256_set1_epi8(0x40);
  masks[7] = _mm256_set1_epi8(0x80);
#endif

  AssertFatal(n_segments>0&&n_segments<=8,"0 < n_segments %d <= 8\n",n_segments);

  //determine number of bits in codeword
  //if (block_length>3840)
  if (BG==1)
    {
      //BG=1;
      Kb = 22;
      nrows=46; //parity check bits
      ncols=22; //info bits
    }
    //else if (block_length<=3840)
   else if	(BG==2)
    {
      //BG=2;
      nrows=42; //parity check bits
      ncols=10; // info bits
      
      if (block_length>640)
	Kb = 10;
      else if (block_length>560)
	Kb = 9;
      else if (block_length>192)
      Kb = 8;
    else
      Kb = 6;
      }

  //find minimum value in all sets of lifting size
  Zc=0;
  for (i1=0; i1 < 51; i1++)
  {
    if (lift_size[i1] >= (double) block_length/Kb)
    {
      Zc = lift_size[i1];
      //printf("%d\n",Zc);
      break;
    }
  }

#ifdef DEBUG_LDPC
  LOG_D(PHY,"ldpc_encoder_optim_8seg: BG %d, Zc %d, Kb %d, block_length %d, segments %d\n",BG,Zc,Kb,block_length,n_segments);
  LOG_D(PHY,"ldpc_encoder_optim_8seg: PDU (seg 0) %x %x %x %x\n",test_input[0][0],test_input[0][1],test_input[0][2],test_input[0][3]);
#endif

  AssertFatal(Zc>0,"no valid Zc found for block length %d\n",block_length);

  if ((Zc&31) > 0) simd_size = 16;
  else          simd_size = 32;

  unsigned char c[22*Zc] __attribute__((aligned(32))); //padded input, unpacked, max size
  unsigned char d[46*Zc] __attribute__((aligned(32))); //coded parity part output, unpacked, max size

  unsigned char c_extension[2*22*Zc*simd_size] __attribute__((aligned(32)));      //double size matrix of c

  // calculate number of punctured bits
  no_punctured_columns=(int)((nrows-2)*Zc+block_length-block_length*3)/Zc;
  removed_bit=(nrows-no_punctured_columns-2) * Zc+block_length-(int)(block_length*3);
  // printf("%d\n",no_punctured_columns);
  // printf("%d\n",removed_bit);
  // unpack input
  memset(c,0,sizeof(unsigned char) * ncols * Zc);
  memset(d,0,sizeof(unsigned char) * nrows * Zc);

  if(tinput != NULL) start_meas(tinput);
#if 0
  for (i=0; i<block_length; i++) {
    for (j=0; j<n_segments; j++) {

      temp = (test_input[j][i/8]&(1<<(i&7)))>>(i&7);
      //printf("c(%d,%d)=%d\n",j,i,temp);
      c[i] |= (temp << j);
    }
  }
#else
#ifdef __AVX2__
  for (i=0; i<block_length>>5; i++) {
    c256 = _mm256_and_si256(_mm256_cmpeq_epi8(_mm256_andnot_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(((uint32_t*)test_input[0])[i]), shufmask),andmask),zero256),masks[0]);
    for (j=1; j<n_segments; j++) {
      c256 = _mm256_or_si256(_mm256_and_si256(_mm256_cmpeq_epi8(_mm256_andnot_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(((uint32_t*)test_input[j])[i]), shufmask),andmask),zero256),masks[j]),c256);
    }
    ((__m256i *)c)[i] = c256;
  }

  for (i=(block_length>>5)<<5;i<block_length;i++) {
    for (j=0; j<n_segments; j++) {

      temp = (test_input[j][i/8]&(1<<(i&7)))>>(i&7);
      //printf("c(%d,%d)=%d\n",j,i,temp);
      c[i] |= (temp << j);
    }
  }
#else
  AssertFatal(1==0,"Need AVX2 for this\n");
#endif
#endif

  if(tinput != NULL) stop_meas(tinput);

  if ((BG==1 && Zc>176) || (BG==2 && Zc>64)) { 
    // extend matrix
    if(tprep != NULL) start_meas(tprep);
    for (i1=0; i1 < ncols; i1++)
      {
	memcpy(&c_extension[2*i1*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
	memcpy(&c_extension[(2*i1+1)*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
      }
    for (i1=1;i1<simd_size;i1++) {
      memcpy(&c_extension[(2*ncols*Zc*i1)], &c_extension[i1], (2*ncols*Zc*sizeof(unsigned char))-i1);
      //    memset(&c_extension[(2*ncols*Zc*i1)],0,i1);
      /*
	printf("shift %d: ",i1);
	for (int j=0;j<64;j++) printf("%d ",c_extension[(2*ncols*Zc*i1)+j]);
	printf("\n");
      */
    }
    if(tprep != NULL) stop_meas(tprep);
    //parity check part
    if(tparity != NULL) start_meas(tparity);
    encode_parity_check_part_optim(c_extension, d, BG, Zc, Kb);
    if(tparity != NULL) stop_meas(tparity);
  }
  else {
    if (encode_parity_check_part_orig(c, d, BG, Zc, Kb, block_length)!=0) {
      printf("Problem with encoder\n");
      return(-1);
    }
  }
  if(toutput != NULL) start_meas(toutput);
  // information part and puncture columns
  /*
  memcpy(&channel_input[0], &c[2*Zc], (block_length-2*Zc)*sizeof(unsigned char));
  memcpy(&channel_input[block_length-2*Zc], &d[0], ((nrows-no_punctured_columns) * Zc-removed_bit)*sizeof(unsigned char));
  */
#ifdef __AVX2__
  if ((((2*Zc)&31) == 0) && (((block_length-(2*Zc))&31) == 0)) {
    //AssertFatal(((2*Zc)&31) == 0,"2*Zc needs to be a multiple of 32 for now\n");
    //AssertFatal(((block_length-(2*Zc))&31) == 0,"block_length-(2*Zc) needs to be a multiple of 32 for now\n");
    uint32_t l1 = (block_length-(2*Zc))>>5;
    uint32_t l2 = ((nrows-no_punctured_columns) * Zc-removed_bit)>>5;
    __m256i *c256p = (__m256i *)&c[2*Zc];
    __m256i *d256p = (__m256i *)&d[0];
    //  if (((block_length-(2*Zc))&31)>0) l1++;
    
    for (i=0;i<l1;i++)
      for (j=0;j<n_segments;j++) ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(c256p[i],j),masks[0]);
    
    //  if ((((nrows-no_punctured_columns) * Zc-removed_bit)&31)>0) l2++;
    
    for (i1=0;i1<l2;i1++,i++)
      for (j=0;j<n_segments;j++) ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(d256p[i1],j),masks[0]);
  }
  else {
#ifdef DEBUG_LDPC
  LOG_W(PHY,"using non-optimized version\n");
#endif
    // do non-SIMD version
    for (i=0;i<(block_length-2*Zc);i++) 
      for (j=0; j<n_segments; j++)
	channel_input[j][i] = (c[2*Zc+i]>>j)&1;
    for (i=0;i<((nrows-no_punctured_columns) * Zc-removed_bit);i++)
      for (j=0; j<n_segments; j++)
	channel_input[j][block_length-2*Zc+i] = (d[i]>>j)&1;
    }

#else
    AssertFatal(1==0,"Need AVX2 for now\n");
#endif

  if(toutput != NULL) stop_meas(toutput);
  return 0;
}

int ldpc_encoder_optim_8seg_multi(unsigned char **test_input,unsigned char **channel_input,short block_length, short BG, int n_segments,unsigned int macro_num, time_stats_t *tinput,time_stats_t *tprep,time_stats_t *tparity,time_stats_t *toutput)
{

  short Zc,Kb=0,nrows=0,ncols=0;
  int i,i1,j;
  int no_punctured_columns,removed_bit;
  //Table of possible lifting sizes
  short lift_size[51]= {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
  char temp;
  int simd_size;
  int macro_segment, macro_segment_end;


  macro_segment = 8*macro_num;
  macro_segment_end = (n_segments > 8*(macro_num+1)) ? 8*(macro_num+1) : n_segments;
  ///printf("macro_segment: %d\n", macro_segment);
  ///printf("macro_segment_end: %d\n", macro_segment_end );

#ifdef __AVX2__
  __m256i shufmask = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,0x0101010101010101, 0x0000000000000000);
  __m256i andmask  = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
  __m256i zero256   = _mm256_setzero_si256();
  __m256i masks[8];
  register __m256i c256;
  masks[0] = _mm256_set1_epi8(0x1);
  masks[1] = _mm256_set1_epi8(0x2);
  masks[2] = _mm256_set1_epi8(0x4);
  masks[3] = _mm256_set1_epi8(0x8);
  masks[4] = _mm256_set1_epi8(0x10);
  masks[5] = _mm256_set1_epi8(0x20);
  masks[6] = _mm256_set1_epi8(0x40);
  masks[7] = _mm256_set1_epi8(0x80);
#endif

  ///AssertFatal(n_segments>0&&n_segments<=8,"0 < n_segments %d <= 8\n",n_segments);

  //determine number of bits in codeword
  if (BG==1)
    {
      BG=1;
      Kb = 22;
      nrows=46; //parity check bits
      ncols=22; //info bits
    }
    else if (BG==2)
    {
      BG=2;
      nrows=42; //parity check bits
      ncols=10; // info bits

      if (block_length>640)
	Kb = 10;
      else if (block_length>560)
	Kb = 9;
      else if (block_length>192)
      Kb = 8;
    else
      Kb = 6;
      }

  //find minimum value in all sets of lifting size
  Zc=0;
  for (i1=0; i1 < 51; i1++)
  {
    if (lift_size[i1] >= (double) block_length/Kb)
    {
      Zc = lift_size[i1];
      //printf("%d\n",Zc);
      break;
    }
  }

#ifdef DEBUG_LDPC
  LOG_D(PHY,"ldpc_encoder_optim_8seg: BG %d, Zc %d, Kb %d, block_length %d, segments %d\n",BG,Zc,Kb,block_length,n_segments);
  LOG_D(PHY,"ldpc_encoder_optim_8seg: PDU (seg 0) %x %x %x %x\n",test_input[0][0],test_input[0][1],test_input[0][2],test_input[0][3]);
#endif

  AssertFatal(Zc>0,"no valid Zc found for block length %d\n",block_length);

  if ((Zc&31) > 0) simd_size = 16;
  else          simd_size = 32;

  unsigned char c[22*Zc] __attribute__((aligned(32))); //padded input, unpacked, max size
  unsigned char d[46*Zc] __attribute__((aligned(32))); //coded parity part output, unpacked, max size

  unsigned char c_extension[2*22*Zc*simd_size] __attribute__((aligned(32)));      //double size matrix of c

  // calculate number of punctured bits
  no_punctured_columns=(int)((nrows-2)*Zc+block_length-block_length*3)/Zc;
  removed_bit=(nrows-no_punctured_columns-2) * Zc+block_length-(int)(block_length*3);
  // printf("%d\n",no_punctured_columns);
  // printf("%d\n",removed_bit);
  // unpack input
  memset(c,0,sizeof(unsigned char) * ncols * Zc);
  memset(d,0,sizeof(unsigned char) * nrows * Zc);

  if(tinput != NULL) start_meas(tinput);
#if 0
  for (i=0; i<block_length; i++) {
	//for (j=0; j<n_segments; j++) {
    for (j=macro_segment; j < macro_segment_end; j++) {

      temp = (test_input[j][i/8]&(1<<(i&7)))>>(i&7);
      //printf("c(%d,%d)=%d\n",j,i,temp);
      c[i] |= (temp << (j-macro_segment));
    }
  }
#else
#ifdef __AVX2__
  for (i=0; i<block_length>>5; i++) {
    c256 = _mm256_and_si256(_mm256_cmpeq_epi8(_mm256_andnot_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(((uint32_t*)test_input[macro_segment])[i]), shufmask),andmask),zero256),masks[0]);
    //for (j=1; j<n_segments; j++) {
    for (j=macro_segment+1; j < macro_segment_end; j++) {
      c256 = _mm256_or_si256(_mm256_and_si256(_mm256_cmpeq_epi8(_mm256_andnot_si256(_mm256_shuffle_epi8(_mm256_set1_epi32(((uint32_t*)test_input[j])[i]), shufmask),andmask),zero256),masks[j-macro_segment]),c256);
    }
    ((__m256i *)c)[i] = c256;
  }

  for (i=(block_length>>5)<<5;i<block_length;i++) {
    //for (j=0; j<n_segments; j++) {
	  for (j=macro_segment; j < macro_segment_end; j++) {

      temp = (test_input[j][i/8]&(1<<(i&7)))>>(i&7);
      //printf("c(%d,%d)=%d\n",j,i,temp);
      c[i] |= (temp << (j-macro_segment));
    }
  }
#else
  AssertFatal(1==0,"Need AVX2 for this\n");
#endif
#endif

  if(tinput != NULL) stop_meas(tinput);

  if ((BG==1 && Zc>176) || (BG==2 && Zc>64)) {
    // extend matrix
    if(tprep != NULL) start_meas(tprep);
    for (i1=0; i1 < ncols; i1++)
      {
	memcpy(&c_extension[2*i1*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
	memcpy(&c_extension[(2*i1+1)*Zc], &c[i1*Zc], Zc*sizeof(unsigned char));
      }
    for (i1=1;i1<simd_size;i1++) {
      memcpy(&c_extension[(2*ncols*Zc*i1)], &c_extension[i1], (2*ncols*Zc*sizeof(unsigned char))-i1);
      //    memset(&c_extension[(2*ncols*Zc*i1)],0,i1);
      /*
	printf("shift %d: ",i1);
	for (int j=0;j<64;j++) printf("%d ",c_extension[(2*ncols*Zc*i1)+j]);
	printf("\n");
      */
    }
    if(tprep != NULL) stop_meas(tprep);
    //parity check part
    if(tparity != NULL) start_meas(tparity);
    encode_parity_check_part_optim(c_extension, d, BG, Zc, Kb);
    if(tparity != NULL) stop_meas(tparity);
  }
  else {
    if (encode_parity_check_part_orig(c, d, BG, Zc, Kb, block_length)!=0) {
      printf("Problem with encoder\n");
      return(-1);
    }
  }
  if(toutput != NULL) start_meas(toutput);
  // information part and puncture columns
  /*
  memcpy(&channel_input[0], &c[2*Zc], (block_length-2*Zc)*sizeof(unsigned char));
  memcpy(&channel_input[block_length-2*Zc], &d[0], ((nrows-no_punctured_columns) * Zc-removed_bit)*sizeof(unsigned char));
  */
#ifdef __AVX2__
  if ((((2*Zc)&31) == 0) && (((block_length-(2*Zc))&31) == 0)) {
    //AssertFatal(((2*Zc)&31) == 0,"2*Zc needs to be a multiple of 32 for now\n");
    //AssertFatal(((block_length-(2*Zc))&31) == 0,"block_length-(2*Zc) needs to be a multiple of 32 for now\n");
    uint32_t l1 = (block_length-(2*Zc))>>5;
    uint32_t l2 = ((nrows-no_punctured_columns) * Zc-removed_bit)>>5;
    __m256i *c256p = (__m256i *)&c[2*Zc];
    __m256i *d256p = (__m256i *)&d[0];
    //  if (((block_length-(2*Zc))&31)>0) l1++;

    for (i=0;i<l1;i++)
      //for (j=0;j<n_segments;j++) ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(c256p[i],j),masks[0]);
    	for (j=macro_segment; j < macro_segment_end; j++) ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(c256p[i],j-macro_segment),masks[0]);


    //  if ((((nrows-no_punctured_columns) * Zc-removed_bit)&31)>0) l2++;

    for (i1=0;i1<l2;i1++,i++)
      //for (j=0;j<n_segments;j++) ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(d256p[i1],j),masks[0]);
    	for (j=macro_segment; j < macro_segment_end; j++)  ((__m256i *)channel_input[j])[i] = _mm256_and_si256(_mm256_srai_epi16(d256p[i1],j-macro_segment),masks[0]);
  }
  else {
#ifdef DEBUG_LDPC
  LOG_W(PHY,"using non-optimized version\n");
#endif
    // do non-SIMD version
    for (i=0;i<(block_length-2*Zc);i++)
      //for (j=0; j<n_segments; j++)
      for (j=macro_segment; j < macro_segment_end; j++)
	channel_input[j][i] = (c[2*Zc+i]>>(j-macro_segment))&1;
    for (i=0;i<((nrows-no_punctured_columns) * Zc-removed_bit);i++)
      //for (j=0; j<n_segments; j++)
    	  for (j=macro_segment; j < macro_segment_end; j++)
	channel_input[j][block_length-2*Zc+i] = (d[i]>>(j-macro_segment))&1;
    }

#else
    AssertFatal(1==0,"Need AVX2 for now\n");
#endif

  if(toutput != NULL) stop_meas(toutput);
  return 0;
}