/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.1  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/*! \file PHY/LTE_TRANSPORT/dlsch_llr_computation.c
 * \brief Top-level routines for LLR computation of the PDSCH physical channel from 36-211, V8.6 2009-03
 * \author R. Knopp, F. Kaltenberger,A. Bhamri, S. Aubert, S. Wagner, X Jiang
 * \date 2011
 * \version 0.1
 * \company Eurecom
 * \email: knopp@eurecom.fr,florian.kaltenberger@eurecom.fr,ankit.bhamri@eurecom.fr,sebastien.aubert@eurecom.fr, sebastian.wagner@eurecom.fr
 * \note
 * \warning
 */

#include "PHY/defs.h"
#include "PHY/TOOLS/defs.h"
#include "PHY/extern.h"
#include "defs.h"
#include "extern.h"
#include "PHY/sse_intrin.h"

int16_t ones256[16] __attribute__ ((aligned(32))) = {0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff,0xffff};

static __m256i rho_rpi __attribute__ ((aligned(32)));
static __m256i rho_rmi __attribute__ ((aligned(32)));
static __m256i rho_rpi_1_1 __attribute__ ((aligned(32)));
static __m256i rho_rpi_1_3 __attribute__ ((aligned(32)));
static __m256i rho_rpi_1_5 __attribute__ ((aligned(32)));
static __m256i rho_rpi_1_7 __attribute__ ((aligned(32)));
static __m256i rho_rpi_3_1 __attribute__ ((aligned(32)));
static __m256i rho_rpi_3_3 __attribute__ ((aligned(32)));
static __m256i rho_rpi_3_5 __attribute__ ((aligned(32)));
static __m256i rho_rpi_3_7 __attribute__ ((aligned(32)));
static __m256i rho_rpi_5_1 __attribute__ ((aligned(32)));
static __m256i rho_rpi_5_3 __attribute__ ((aligned(32)));
static __m256i rho_rpi_5_5 __attribute__ ((aligned(32)));
static __m256i rho_rpi_5_7 __attribute__ ((aligned(32)));
static __m256i rho_rpi_7_1 __attribute__ ((aligned(32)));
static __m256i rho_rpi_7_3 __attribute__ ((aligned(32)));
static __m256i rho_rpi_7_5 __attribute__ ((aligned(32)));
static __m256i rho_rpi_7_7 __attribute__ ((aligned(32)));
static __m256i rho_rmi_1_1 __attribute__ ((aligned(32)));
static __m256i rho_rmi_1_3 __attribute__ ((aligned(32)));
static __m256i rho_rmi_1_5 __attribute__ ((aligned(32)));
static __m256i rho_rmi_1_7 __attribute__ ((aligned(32)));
static __m256i rho_rmi_3_1 __attribute__ ((aligned(32)));
static __m256i rho_rmi_3_3 __attribute__ ((aligned(32)));
static __m256i rho_rmi_3_5 __attribute__ ((aligned(32)));
static __m256i rho_rmi_3_7 __attribute__ ((aligned(32)));
static __m256i rho_rmi_5_1 __attribute__ ((aligned(32)));
static __m256i rho_rmi_5_3 __attribute__ ((aligned(32)));
static __m256i rho_rmi_5_5 __attribute__ ((aligned(32)));
static __m256i rho_rmi_5_7 __attribute__ ((aligned(32)));
static __m256i rho_rmi_7_1 __attribute__ ((aligned(32)));
static __m256i rho_rmi_7_3 __attribute__ ((aligned(32)));
static __m256i rho_rmi_7_5 __attribute__ ((aligned(32)));
static __m256i rho_rmi_7_7 __attribute__ ((aligned(32)));

static __m256i psi_r_m7_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_m7_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_m5_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_m3_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_m1_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_p1_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_p3_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_p5_p7 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_m7 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_m5 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_m3 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_m1 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_p1 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_p3 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_p5 __attribute__ ((aligned(32)));
static __m256i psi_r_p7_p7 __attribute__ ((aligned(32)));

static __m256i psi_i_m7_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_m7_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_m5_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_m3_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_m1_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_p1_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_p3_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_p5_p7 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_m7 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_m5 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_m3 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_m1 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_p1 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_p3 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_p5 __attribute__ ((aligned(32)));
static __m256i psi_i_p7_p7 __attribute__ ((aligned(32)));

static __m256i a_r_m7_m7 __attribute__ ((aligned(32)));
static __m256i a_r_m7_m5 __attribute__ ((aligned(32)));
static __m256i a_r_m7_m3 __attribute__ ((aligned(32)));
static __m256i a_r_m7_m1 __attribute__ ((aligned(32)));
static __m256i a_r_m7_p1 __attribute__ ((aligned(32)));
static __m256i a_r_m7_p3 __attribute__ ((aligned(32)));
static __m256i a_r_m7_p5 __attribute__ ((aligned(32)));
static __m256i a_r_m7_p7 __attribute__ ((aligned(32)));
static __m256i a_r_m5_m7 __attribute__ ((aligned(32)));
static __m256i a_r_m5_m5 __attribute__ ((aligned(32)));
static __m256i a_r_m5_m3 __attribute__ ((aligned(32)));
static __m256i a_r_m5_m1 __attribute__ ((aligned(32)));
static __m256i a_r_m5_p1 __attribute__ ((aligned(32)));
static __m256i a_r_m5_p3 __attribute__ ((aligned(32)));
static __m256i a_r_m5_p5 __attribute__ ((aligned(32)));
static __m256i a_r_m5_p7 __attribute__ ((aligned(32)));
static __m256i a_r_m3_m7 __attribute__ ((aligned(32)));
static __m256i a_r_m3_m5 __attribute__ ((aligned(32)));
static __m256i a_r_m3_m3 __attribute__ ((aligned(32)));
static __m256i a_r_m3_m1 __attribute__ ((aligned(32)));
static __m256i a_r_m3_p1 __attribute__ ((aligned(32)));
static __m256i a_r_m3_p3 __attribute__ ((aligned(32)));
static __m256i a_r_m3_p5 __attribute__ ((aligned(32)));
static __m256i a_r_m3_p7 __attribute__ ((aligned(32)));
static __m256i a_r_m1_m7 __attribute__ ((aligned(32)));
static __m256i a_r_m1_m5 __attribute__ ((aligned(32)));
static __m256i a_r_m1_m3 __attribute__ ((aligned(32)));
static __m256i a_r_m1_m1 __attribute__ ((aligned(32)));
static __m256i a_r_m1_p1 __attribute__ ((aligned(32)));
static __m256i a_r_m1_p3 __attribute__ ((aligned(32)));
static __m256i a_r_m1_p5 __attribute__ ((aligned(32)));
static __m256i a_r_m1_p7 __attribute__ ((aligned(32)));
static __m256i a_r_p1_m7 __attribute__ ((aligned(32)));
static __m256i a_r_p1_m5 __attribute__ ((aligned(32)));
static __m256i a_r_p1_m3 __attribute__ ((aligned(32)));
static __m256i a_r_p1_m1 __attribute__ ((aligned(32)));
static __m256i a_r_p1_p1 __attribute__ ((aligned(32)));
static __m256i a_r_p1_p3 __attribute__ ((aligned(32)));
static __m256i a_r_p1_p5 __attribute__ ((aligned(32)));
static __m256i a_r_p1_p7 __attribute__ ((aligned(32)));
static __m256i a_r_p3_m7 __attribute__ ((aligned(32)));
static __m256i a_r_p3_m5 __attribute__ ((aligned(32)));
static __m256i a_r_p3_m3 __attribute__ ((aligned(32)));
static __m256i a_r_p3_m1 __attribute__ ((aligned(32)));
static __m256i a_r_p3_p1 __attribute__ ((aligned(32)));
static __m256i a_r_p3_p3 __attribute__ ((aligned(32)));
static __m256i a_r_p3_p5 __attribute__ ((aligned(32)));
static __m256i a_r_p3_p7 __attribute__ ((aligned(32)));
static __m256i a_r_p5_m7 __attribute__ ((aligned(32)));
static __m256i a_r_p5_m5 __attribute__ ((aligned(32)));
static __m256i a_r_p5_m3 __attribute__ ((aligned(32)));
static __m256i a_r_p5_m1 __attribute__ ((aligned(32)));
static __m256i a_r_p5_p1 __attribute__ ((aligned(32)));
static __m256i a_r_p5_p3 __attribute__ ((aligned(32)));
static __m256i a_r_p5_p5 __attribute__ ((aligned(32)));
static __m256i a_r_p5_p7 __attribute__ ((aligned(32)));
static __m256i a_r_p7_m7 __attribute__ ((aligned(32)));
static __m256i a_r_p7_m5 __attribute__ ((aligned(32)));
static __m256i a_r_p7_m3 __attribute__ ((aligned(32)));
static __m256i a_r_p7_m1 __attribute__ ((aligned(32)));
static __m256i a_r_p7_p1 __attribute__ ((aligned(32)));
static __m256i a_r_p7_p3 __attribute__ ((aligned(32)));
static __m256i a_r_p7_p5 __attribute__ ((aligned(32)));
static __m256i a_r_p7_p7 __attribute__ ((aligned(32)));

static __m256i a_i_m7_m7 __attribute__ ((aligned(32)));
static __m256i a_i_m7_m5 __attribute__ ((aligned(32)));
static __m256i a_i_m7_m3 __attribute__ ((aligned(32)));
static __m256i a_i_m7_m1 __attribute__ ((aligned(32)));
static __m256i a_i_m7_p1 __attribute__ ((aligned(32)));
static __m256i a_i_m7_p3 __attribute__ ((aligned(32)));
static __m256i a_i_m7_p5 __attribute__ ((aligned(32)));
static __m256i a_i_m7_p7 __attribute__ ((aligned(32)));
static __m256i a_i_m5_m7 __attribute__ ((aligned(32)));
static __m256i a_i_m5_m5 __attribute__ ((aligned(32)));
static __m256i a_i_m5_m3 __attribute__ ((aligned(32)));
static __m256i a_i_m5_m1 __attribute__ ((aligned(32)));
static __m256i a_i_m5_p1 __attribute__ ((aligned(32)));
static __m256i a_i_m5_p3 __attribute__ ((aligned(32)));
static __m256i a_i_m5_p5 __attribute__ ((aligned(32)));
static __m256i a_i_m5_p7 __attribute__ ((aligned(32)));
static __m256i a_i_m3_m7 __attribute__ ((aligned(32)));
static __m256i a_i_m3_m5 __attribute__ ((aligned(32)));
static __m256i a_i_m3_m3 __attribute__ ((aligned(32)));
static __m256i a_i_m3_m1 __attribute__ ((aligned(32)));
static __m256i a_i_m3_p1 __attribute__ ((aligned(32)));
static __m256i a_i_m3_p3 __attribute__ ((aligned(32)));
static __m256i a_i_m3_p5 __attribute__ ((aligned(32)));
static __m256i a_i_m3_p7 __attribute__ ((aligned(32)));
static __m256i a_i_m1_m7 __attribute__ ((aligned(32)));
static __m256i a_i_m1_m5 __attribute__ ((aligned(32)));
static __m256i a_i_m1_m3 __attribute__ ((aligned(32)));
static __m256i a_i_m1_m1 __attribute__ ((aligned(32)));
static __m256i a_i_m1_p1 __attribute__ ((aligned(32)));
static __m256i a_i_m1_p3 __attribute__ ((aligned(32)));
static __m256i a_i_m1_p5 __attribute__ ((aligned(32)));
static __m256i a_i_m1_p7 __attribute__ ((aligned(32)));
static __m256i a_i_p1_m7 __attribute__ ((aligned(32)));
static __m256i a_i_p1_m5 __attribute__ ((aligned(32)));
static __m256i a_i_p1_m3 __attribute__ ((aligned(32)));
static __m256i a_i_p1_m1 __attribute__ ((aligned(32)));
static __m256i a_i_p1_p1 __attribute__ ((aligned(32)));
static __m256i a_i_p1_p3 __attribute__ ((aligned(32)));
static __m256i a_i_p1_p5 __attribute__ ((aligned(32)));
static __m256i a_i_p1_p7 __attribute__ ((aligned(32)));
static __m256i a_i_p3_m7 __attribute__ ((aligned(32)));
static __m256i a_i_p3_m5 __attribute__ ((aligned(32)));
static __m256i a_i_p3_m3 __attribute__ ((aligned(32)));
static __m256i a_i_p3_m1 __attribute__ ((aligned(32)));
static __m256i a_i_p3_p1 __attribute__ ((aligned(32)));
static __m256i a_i_p3_p3 __attribute__ ((aligned(32)));
static __m256i a_i_p3_p5 __attribute__ ((aligned(32)));
static __m256i a_i_p3_p7 __attribute__ ((aligned(32)));
static __m256i a_i_p5_m7 __attribute__ ((aligned(32)));
static __m256i a_i_p5_m5 __attribute__ ((aligned(32)));
static __m256i a_i_p5_m3 __attribute__ ((aligned(32)));
static __m256i a_i_p5_m1 __attribute__ ((aligned(32)));
static __m256i a_i_p5_p1 __attribute__ ((aligned(32)));
static __m256i a_i_p5_p3 __attribute__ ((aligned(32)));
static __m256i a_i_p5_p5 __attribute__ ((aligned(32)));
static __m256i a_i_p5_p7 __attribute__ ((aligned(32)));
static __m256i a_i_p7_m7 __attribute__ ((aligned(32)));
static __m256i a_i_p7_m5 __attribute__ ((aligned(32)));
static __m256i a_i_p7_m3 __attribute__ ((aligned(32)));
static __m256i a_i_p7_m1 __attribute__ ((aligned(32)));
static __m256i a_i_p7_p1 __attribute__ ((aligned(32)));
static __m256i a_i_p7_p3 __attribute__ ((aligned(32)));
static __m256i a_i_p7_p5 __attribute__ ((aligned(32)));
static __m256i a_i_p7_p7 __attribute__ ((aligned(32)));

static __m256i psi_a_m7_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_m7_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_m5_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_m3_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_m1_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_p1_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_p3_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_p5_p7 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_m7 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_m5 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_m3 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_m1 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_p1 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_p3 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_p5 __attribute__ ((aligned(32)));
static __m256i psi_a_p7_p7 __attribute__ ((aligned(32)));

static __m256i a_sq_m7_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_m7_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_m5_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_m3_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_m1_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_p1_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_p3_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_p5_p7 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_m7 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_m5 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_m3 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_m1 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_p1 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_p3 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_p5 __attribute__ ((aligned(32)));
static __m256i a_sq_p7_p7 __attribute__ ((aligned(32)));

static __m256i bit_met_m7_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_m7_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_m5_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_m3_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_m1_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_p1_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_p3_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_p5_p7 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_m7 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_m5 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_m3 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_m1 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_p1 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_p3 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_p5 __attribute__ ((aligned(32)));
static __m256i bit_met_p7_p7 __attribute__ ((aligned(32)));

static __m256i  y0_p_1_1 __attribute__ ((aligned(32)));
static __m256i  y0_p_1_3 __attribute__ ((aligned(32)));
static __m256i  y0_p_1_5 __attribute__ ((aligned(32)));
static __m256i  y0_p_1_7 __attribute__ ((aligned(32)));
static __m256i  y0_p_3_1 __attribute__ ((aligned(32)));
static __m256i  y0_p_3_3 __attribute__ ((aligned(32)));
static __m256i  y0_p_3_5 __attribute__ ((aligned(32)));
static __m256i  y0_p_3_7 __attribute__ ((aligned(32)));
static __m256i  y0_p_5_1 __attribute__ ((aligned(32)));
static __m256i  y0_p_5_3 __attribute__ ((aligned(32)));
static __m256i  y0_p_5_5 __attribute__ ((aligned(32)));
static __m256i  y0_p_5_7 __attribute__ ((aligned(32)));
static __m256i  y0_p_7_1 __attribute__ ((aligned(32)));
static __m256i  y0_p_7_3 __attribute__ ((aligned(32)));
static __m256i  y0_p_7_5 __attribute__ ((aligned(32)));
static __m256i  y0_p_7_7 __attribute__ ((aligned(32)));
static __m256i  y0_m_1_1 __attribute__ ((aligned(32)));
static __m256i  y0_m_1_3 __attribute__ ((aligned(32)));
static __m256i  y0_m_1_5 __attribute__ ((aligned(32)));
static __m256i  y0_m_1_7 __attribute__ ((aligned(32)));
static __m256i  y0_m_3_1 __attribute__ ((aligned(32)));
static __m256i  y0_m_3_3 __attribute__ ((aligned(32)));
static __m256i  y0_m_3_5 __attribute__ ((aligned(32)));
static __m256i  y0_m_3_7 __attribute__ ((aligned(32)));
static __m256i  y0_m_5_1 __attribute__ ((aligned(32)));
static __m256i  y0_m_5_3 __attribute__ ((aligned(32)));
static __m256i  y0_m_5_5 __attribute__ ((aligned(32)));
static __m256i  y0_m_5_7 __attribute__ ((aligned(32)));
static __m256i  y0_m_7_1 __attribute__ ((aligned(32)));
static __m256i  y0_m_7_3 __attribute__ ((aligned(32)));
static __m256i  y0_m_7_5 __attribute__ ((aligned(32)));
static __m256i  y0_m_7_7 __attribute__ ((aligned(32)));

static __m256i  xmm0 __attribute__ ((aligned(32)));
static __m256i  xmm1 __attribute__ ((aligned(32)));
static __m256i  xmm2 __attribute__ ((aligned(32)));
static __m256i  xmm3 __attribute__ ((aligned(32)));
static __m256i  xmm4 __attribute__ ((aligned(32)));
static __m256i  xmm5 __attribute__ ((aligned(32)));
static __m256i  xmm6 __attribute__ ((aligned(32)));
static __m256i  xmm7 __attribute__ ((aligned(32)));
static __m256i  xmm8 __attribute__ ((aligned(32)));

static __m256i  y0r __attribute__ ((aligned(32)));
static __m256i  y0i __attribute__ ((aligned(32)));
static __m256i  y1r __attribute__ ((aligned(32)));
static __m256i  y1i __attribute__ ((aligned(32)));
static __m256i  y2r __attribute__ ((aligned(32)));
static __m256i  y2i __attribute__ ((aligned(32)));

static __m256i  logmax_num_re0 __attribute__ ((aligned(32)));
static __m256i  logmax_den_re0 __attribute__ ((aligned(32)));

static __m256i tmp_result  __attribute__ ((aligned(32)));
static __m256i tmp_result2 __attribute__ ((aligned(32)));
static __m256i tmp_result3 __attribute__ ((aligned(32)));
static __m256i tmp_result4 __attribute__ ((aligned(32)));

//==============================================================================================
// Auxiliary Makros

// calculate interference magnitude
#define interference_abs_epi16(psi,int_ch_mag,int_mag,c1,c2) tmp_result = _mm256_cmpgt_epi16(int_ch_mag,psi); tmp_result2 = _mm256_xor_si256(tmp_result,(*(__m256i*)&ones256[0])); tmp_result = _mm256_and_si256(tmp_result,c1); tmp_result2 = _mm256_and_si256(tmp_result2,c2); int_mag = _mm256_or_si256(tmp_result,tmp_result2);

// calculate interference magnitude
// tmp_result = ones in shorts corr. to interval 2<=x<=4, tmp_result2 interval < 2, tmp_result3 interval 4<x<6 and tmp_result4 interval x>6
#define interference_abs_64qam_epi16(psi,int_ch_mag,int_two_ch_mag,int_three_ch_mag,a,c1,c3,c5,c7) tmp_result = _mm256_cmpgt_epi16(int_two_ch_mag,psi); tmp_result3 = _mm256_xor_si256(tmp_result,(*(__m256i*)&ones256[0])); tmp_result2 = _mm256_cmpgt_epi16(int_ch_mag,psi); tmp_result = _mm256_xor_si256(tmp_result,tmp_result2); tmp_result4 = _mm256_cmpgt_epi16(psi,int_three_ch_mag); tmp_result3 = _mm256_xor_si256(tmp_result3,tmp_result4); tmp_result = _mm256_and_si256(tmp_result,c3); tmp_result2 = _mm256_and_si256(tmp_result2,c1); tmp_result3 = _mm256_and_si256(tmp_result3,c5); tmp_result4 = _mm256_and_si256(tmp_result4,c7); tmp_result = _mm256_or_si256(tmp_result,tmp_result2); tmp_result3 = _mm256_or_si256(tmp_result3,tmp_result4); a = _mm256_or_si256(tmp_result,tmp_result3);

// calculates psi_a = psi_r*a_r + psi_i*a_i
#define prodsum_psi_a_epi16(psi_r,a_r,psi_i,a_i,psi_a) tmp_result = _mm256_mulhi_epi16(psi_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(psi_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); psi_a = _mm256_adds_epi16(tmp_result,tmp_result2);

// calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor
#define square_a_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq) tmp_result = _mm256_mulhi_epi16(a_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(a_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); a_sq = _mm256_adds_epi16(tmp_result,tmp_result2);

// calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor for 64-QAM
#define square_a_64qam_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq)  tmp_result = _mm256_mulhi_epi16(a_r,a_r); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result = _mm256_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm256_slli_epi16(tmp_result,3); tmp_result = _mm256_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm256_slli_epi16(tmp_result,1); tmp_result2 = _mm256_mulhi_epi16(a_i,a_i); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm256_slli_epi16(tmp_result2,3); tmp_result2 = _mm256_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm256_slli_epi16(tmp_result2,1); a_sq = _mm256_adds_epi16(tmp_result,tmp_result2);

void seperate_real_imag_parts(__m256i *out_re,
                              __m256i *out_im,
                              __m256i in0,
                              __m256i in1)
{
    __m256i tmp0;
    __m256i tmp1;

    in0 = _mm256_shufflelo_epi16(in0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    in0 = _mm256_shufflehi_epi16(in0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    in0 = _mm256_shuffle_epi32(in0,0xd8); //_MM_SHUFFLE(0,2,1,3));

    in1 = _mm256_shufflelo_epi16(in1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    in1 = _mm256_shufflehi_epi16(in1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    in1 = _mm256_shuffle_epi32(in1,0xd8); //_MM_SHUFFLE(0,2,1,3));

    //in0 = [Re(0,1,2,3)   Im(0,1,2,3)   Re(4,5,6,7)     Im(4,5,6,7)]
    //in0 = [Re(8,9,10,11) Im(8,9,10,11) Re(12,13,14,15) Im(12,13,14,15)]

    tmp0 = _mm256_unpacklo_epi64(in0, in1);
    //axmm2 = [Re(0,1,2,3) Re(8,9,10,11) Re(4,5,6,7) Re(12,13,14,15)]
    tmp0 = _mm256_permute4x64_epi64(tmp0,0xd8); // Re(rho)

    tmp1 = _mm256_unpackhi_epi64(in0, in1);
    //axmm3 = [Im(0,1,2,3) Im(8,9,10,11) Im(4,5,6,7) Im(12,13,14,15)]
    tmp1 = _mm256_permute4x64_epi64(tmp1,0xd8); // Im(rho)

    *out_re = tmp0;
    *out_im = tmp1;
}

void qam64_qam16_avx2(short *stream0_in,
                      short *stream1_in,
                      short *ch_mag,
                      short *ch_mag_i,
                      short *stream0_out,
                      short *rho01,
                      int length
    )
{

  /*
    Author: S. Wagner
    Date: 31-07-12

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)

  __m256i *rho01_256i      = (__m256i *)rho01;
  __m256i *stream0_256i_in = (__m256i *)stream0_in;
  __m256i *stream1_256i_in = (__m256i *)stream1_in;
  __m256i *ch_mag_256i     = (__m256i *)ch_mag;
  __m256i *ch_mag_256i_i   = (__m256i *)ch_mag_i;

  __m256i ONE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(10112)); // round(1/sqrt(42)*2^16)
  __m256i THREE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(30337)); // round(3/sqrt(42)*2^16)
  __m256i FIVE_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(25281)); // round(5/sqrt(42)*2^15)
  __m256i SEVEN_OVER_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(17697)); // round(5/sqrt(42)*2^15)
  __m256i FORTYNINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(30969)); // round(49/(4*sqrt(42))*2^14), Q2.14
  __m256i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(23385)); // round(37/(4*sqrt(42))*2^14), Q2.14
  __m256i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(31601)); // round(25/(4*sqrt(42))*2^15)
  __m256i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(18329)); // round(29/(4*sqrt(42))*2^15), Q2.14
  __m256i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(21489)); // round(17/(4*sqrt(42))*2^15)
  __m256i NINE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(11376)); // round(9/(4*sqrt(42))*2^15)
  __m256i THIRTEEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(16433)); // round(13/(4*sqrt(42))*2^15)
  __m256i FIVE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(6320)); // round(5/(4*sqrt(42))*2^15)
  __m256i ONE_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(1264)); // round(1/(4*sqrt(42))*2^15)
  __m256i ONE_OVER_SQRT_10_Q15 = _mm256_broadcastw_epi16(_mm_set1_epi16(10362)); // round(1/sqrt(10)*2^15)
  __m256i THREE_OVER_SQRT_10 = _mm256_broadcastw_epi16(_mm_set1_epi16(31086)); // round(3/sqrt(10)*2^15)
  __m256i SQRT_10_OVER_FOUR = _mm256_broadcastw_epi16(_mm_set1_epi16(25905)); // round(sqrt(10)/4*2^15)


  __m256i ch_mag_int;
  __m256i ch_mag_des;
  __m256i ch_mag_98_over_42_with_sigma2;
  __m256i ch_mag_74_over_42_with_sigma2;
  __m256i ch_mag_58_over_42_with_sigma2;
  __m256i ch_mag_50_over_42_with_sigma2;
  __m256i ch_mag_34_over_42_with_sigma2;
  __m256i ch_mag_18_over_42_with_sigma2;
  __m256i ch_mag_26_over_42_with_sigma2;
  __m256i ch_mag_10_over_42_with_sigma2;
  __m256i ch_mag_2_over_42_with_sigma2;
  __m256i  y0r_one_over_sqrt_21;
  __m256i  y0r_three_over_sqrt_21;
  __m256i  y0r_five_over_sqrt_21;
  __m256i  y0r_seven_over_sqrt_21;
  __m256i  y0i_one_over_sqrt_21;
  __m256i  y0i_three_over_sqrt_21;
  __m256i  y0i_five_over_sqrt_21;
  __m256i  y0i_seven_over_sqrt_21;

#elif defined(__arm__)

#endif
  int i,j;
  uint32_t len256 = (length)>>3;

  for (i=0; i<len256; i+=2) {

#if defined(__x86_64__) || defined(__i386__)
    // Get rho
      /*
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
      */
    seperate_real_imag_parts(&xmm2, &xmm3, rho01_256i[i], rho01_256i[i+1]);

    rho_rpi = _mm256_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm256_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
    rho_rmi_1_1 = _mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
    rho_rpi_3_3 = _mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
    rho_rmi_3_3 = _mm256_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
    rho_rpi_5_5 = _mm256_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
    rho_rmi_5_5 = _mm256_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
    rho_rpi_7_7 = _mm256_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
    rho_rmi_7_7 = _mm256_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);

    rho_rpi_5_5 = _mm256_slli_epi16(rho_rpi_5_5, 1);
    rho_rmi_5_5 = _mm256_slli_epi16(rho_rmi_5_5, 1);
    rho_rpi_7_7 = _mm256_slli_epi16(rho_rpi_7_7, 2);
    rho_rmi_7_7 = _mm256_slli_epi16(rho_rmi_7_7, 2);

    xmm4 = _mm256_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
    xmm5 = _mm256_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
    xmm6 = _mm256_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
    xmm7 = _mm256_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
    xmm8 = _mm256_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
    xmm7 = _mm256_slli_epi16(xmm7, 1);
    xmm8 = _mm256_slli_epi16(xmm8, 2);

    rho_rpi_1_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_1_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_1_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_1_5 = _mm256_subs_epi16(xmm4, xmm7);
    rho_rpi_1_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_1_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
    rho_rpi_3_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_3_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_3_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_3_5 = _mm256_subs_epi16(xmm4, xmm7);
    rho_rpi_3_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_3_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
    xmm4 = _mm256_slli_epi16(xmm4, 1);
    rho_rpi_5_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_5_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_5_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_5_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_5_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_5_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
    xmm4 = _mm256_slli_epi16(xmm4, 2);
    rho_rpi_7_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_7_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_7_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_7_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_7_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_7_5 = _mm256_subs_epi16(xmm4, xmm7);

    // Rearrange interfering MF output
    /*
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm256_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm256_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]
    */

    seperate_real_imag_parts(&y1r, &y1i, stream1_256i_in[i], stream1_256i_in[i+1]);

    // Psi_r calculation from rho_rpi or rho_rmi
    xmm0 = _mm256_broadcastw_epi16(_mm_set1_epi16(0));// ZERO for abs_pi16
    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1r);
    psi_r_p7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1r);
    psi_r_p7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1r);
    psi_r_p7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1r);
    psi_r_p7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1r);
    psi_r_p7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1r);
    psi_r_p7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1r);
    psi_r_p7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1r);
    psi_r_p7_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1r);
    psi_r_p5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1r);
    psi_r_p5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1r);
    psi_r_p5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1r);
    psi_r_p5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1r);
    psi_r_p5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1r);
    psi_r_p5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1r);
    psi_r_p5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1r);
    psi_r_p5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1r);
    psi_r_p3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1r);
    psi_r_p3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1r);
    psi_r_p3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1r);
    psi_r_p3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1r);
    psi_r_p3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1r);
    psi_r_p3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1r);
    psi_r_p3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1r);
    psi_r_p3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1r);
    psi_r_p1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1r);
    psi_r_p1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1r);
    psi_r_p1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1r);
    psi_r_p1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1r);
    psi_r_p1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1r);
    psi_r_p1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1r);
    psi_r_p1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1r);
    psi_r_p1_m7 = _mm256_abs_epi16(xmm2);

    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1r);
    psi_r_m1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1r);
    psi_r_m1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1r);
    psi_r_m1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1r);
    psi_r_m1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1r);
    psi_r_m1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1r);
    psi_r_m1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1r);
    psi_r_m1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1r);
    psi_r_m1_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1r);
    psi_r_m3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1r);
    psi_r_m3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1r);
    psi_r_m3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1r);
    psi_r_m3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1r);
    psi_r_m3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1r);
    psi_r_m3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1r);
    psi_r_m3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1r);
    psi_r_m3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1r);
    psi_r_m5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1r);
    psi_r_m5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1r);
    psi_r_m5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1r);
    psi_r_m5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1r);
    psi_r_m5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1r);
    psi_r_m5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1r);
    psi_r_m5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1r);
    psi_r_m5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1r);
    psi_r_m7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1r);
    psi_r_m7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1r);
    psi_r_m7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1r);
    psi_r_m7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1r);
    psi_r_m7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1r);
    psi_r_m7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1r);
    psi_r_m7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1r);
    psi_r_m7_m7 = _mm256_abs_epi16(xmm2);

    // Psi_i calculation from rho_rpi or rho_rmi
    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1i);
    psi_i_p7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1i);
    psi_i_p7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1i);
    psi_i_p7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1i);
    psi_i_p7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1i);
    psi_i_p7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1i);
    psi_i_p7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1i);
    psi_i_p7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1i);
    psi_i_p7_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1i);
    psi_i_p5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1i);
    psi_i_p5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1i);
    psi_i_p5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1i);
    psi_i_p5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1i);
    psi_i_p5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1i);
    psi_i_p5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1i);
    psi_i_p5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1i);
    psi_i_p5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1i);
    psi_i_p3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1i);
    psi_i_p3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1i);
    psi_i_p3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1i);
    psi_i_p3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1i);
    psi_i_p3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1i);
    psi_i_p3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1i);
    psi_i_p3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1i);
    psi_i_p3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1i);
    psi_i_p1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1i);
    psi_i_p1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1i);
    psi_i_p1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1i);
    psi_i_p1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1i);
    psi_i_p1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1i);
    psi_i_p1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1i);
    psi_i_p1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1i);
    psi_i_p1_m7 = _mm256_abs_epi16(xmm2);

    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1i);
    psi_i_m1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1i);
    psi_i_m1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1i);
    psi_i_m1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1i);
    psi_i_m1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1i);
    psi_i_m1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1i);
    psi_i_m1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1i);
    psi_i_m1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1i);
    psi_i_m1_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1i);
    psi_i_m3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1i);
    psi_i_m3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1i);
    psi_i_m3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1i);
    psi_i_m3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1i);
    psi_i_m3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1i);
    psi_i_m3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1i);
    psi_i_m3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1i);
    psi_i_m3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1i);
    psi_i_m5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1i);
    psi_i_m5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1i);
    psi_i_m5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1i);
    psi_i_m5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1i);
    psi_i_m5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1i);
    psi_i_m5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1i);
    psi_i_m5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1i);
    psi_i_m5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1i);
    psi_i_m7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1i);
    psi_i_m7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1i);
    psi_i_m7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1i);
    psi_i_m7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1i);
    psi_i_m7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1i);
    psi_i_m7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1i);
    psi_i_m7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1i);
    psi_i_m7_m7 = _mm256_abs_epi16(xmm2);

/*
    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm256_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm256_unpackhi_epi64(xmm0,xmm1);
*/
    seperate_real_imag_parts(&y0r, &y0i, stream0_256i_in[i], stream0_256i_in[i+1]);

    /*
    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
    xmm2 = _mm256_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm256_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm256_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_des = _mm256_unpacklo_epi64(xmm2,xmm3);
    */

    seperate_real_imag_parts(&ch_mag_des, &xmm2, ch_mag_256i[i], ch_mag_256i[i+1]);

    // Rearrange interfering channel magnitudes
    /*
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];
    xmm2 = _mm256_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm256_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm256_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm256_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_int  = _mm256_unpacklo_epi64(xmm2,xmm3);
    */

    seperate_real_imag_parts(&ch_mag_int, &xmm2, ch_mag_256i_i[i], ch_mag_256i_i[i+1]);

    y0r_one_over_sqrt_21   = _mm256_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
    y0r_three_over_sqrt_21 = _mm256_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm256_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm256_slli_epi16(y0r_five_over_sqrt_21, 1);
    y0r_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
    y0r_seven_over_sqrt_21 = _mm256_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14

    y0i_one_over_sqrt_21   = _mm256_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
    y0i_three_over_sqrt_21 = _mm256_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm256_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm256_slli_epi16(y0i_five_over_sqrt_21, 1);
    y0i_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
    y0i_seven_over_sqrt_21 = _mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14

    y0_p_7_1 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_7_3 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_7_5 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_7_7 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_5_1 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_5_3 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_5_5 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_5_7 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_3_1 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_3_3 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_3_5 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_3_7 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_1_1 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_1_3 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_1_5 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_1_7 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);

    y0_m_1_1 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_1_3 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_1_5 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_1_7 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_3_1 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_3_3 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_3_5 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_3_7 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_5_1 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_5_3 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_5_5 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_5_7 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_7_1 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_7_3 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_7_5 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_7_7 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);

    interference_abs_epi16(psi_r_p7_p7, ch_mag_int, a_r_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p5, ch_mag_int, a_r_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p3, ch_mag_int, a_r_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p1, ch_mag_int, a_r_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m1, ch_mag_int, a_r_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m3, ch_mag_int, a_r_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m5, ch_mag_int, a_r_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m7, ch_mag_int, a_r_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p7, ch_mag_int, a_r_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p5, ch_mag_int, a_r_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p3, ch_mag_int, a_r_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p1, ch_mag_int, a_r_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m1, ch_mag_int, a_r_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m3, ch_mag_int, a_r_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m5, ch_mag_int, a_r_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m7, ch_mag_int, a_r_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p7, ch_mag_int, a_r_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p5, ch_mag_int, a_r_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p3, ch_mag_int, a_r_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p1, ch_mag_int, a_r_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m1, ch_mag_int, a_r_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m3, ch_mag_int, a_r_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m5, ch_mag_int, a_r_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m7, ch_mag_int, a_r_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p7, ch_mag_int, a_r_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p5, ch_mag_int, a_r_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p3, ch_mag_int, a_r_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p1, ch_mag_int, a_r_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m1, ch_mag_int, a_r_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m3, ch_mag_int, a_r_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m5, ch_mag_int, a_r_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m7, ch_mag_int, a_r_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p7, ch_mag_int, a_r_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p5, ch_mag_int, a_r_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p3, ch_mag_int, a_r_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p1, ch_mag_int, a_r_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m1, ch_mag_int, a_r_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m3, ch_mag_int, a_r_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m5, ch_mag_int, a_r_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m7, ch_mag_int, a_r_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p7, ch_mag_int, a_r_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p5, ch_mag_int, a_r_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p3, ch_mag_int, a_r_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p1, ch_mag_int, a_r_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m1, ch_mag_int, a_r_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m3, ch_mag_int, a_r_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m5, ch_mag_int, a_r_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m7, ch_mag_int, a_r_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p7, ch_mag_int, a_r_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p5, ch_mag_int, a_r_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p3, ch_mag_int, a_r_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p1, ch_mag_int, a_r_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m1, ch_mag_int, a_r_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m3, ch_mag_int, a_r_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m5, ch_mag_int, a_r_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m7, ch_mag_int, a_r_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p7, ch_mag_int, a_r_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p5, ch_mag_int, a_r_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p3, ch_mag_int, a_r_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p1, ch_mag_int, a_r_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m1, ch_mag_int, a_r_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m3, ch_mag_int, a_r_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m5, ch_mag_int, a_r_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m7, ch_mag_int, a_r_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    interference_abs_epi16(psi_i_p7_p7, ch_mag_int, a_i_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p5, ch_mag_int, a_i_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p3, ch_mag_int, a_i_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p1, ch_mag_int, a_i_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m1, ch_mag_int, a_i_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m3, ch_mag_int, a_i_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m5, ch_mag_int, a_i_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m7, ch_mag_int, a_i_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p7, ch_mag_int, a_i_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p5, ch_mag_int, a_i_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p3, ch_mag_int, a_i_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p1, ch_mag_int, a_i_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m1, ch_mag_int, a_i_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m3, ch_mag_int, a_i_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m5, ch_mag_int, a_i_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m7, ch_mag_int, a_i_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p7, ch_mag_int, a_i_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p5, ch_mag_int, a_i_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p3, ch_mag_int, a_i_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p1, ch_mag_int, a_i_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m1, ch_mag_int, a_i_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m3, ch_mag_int, a_i_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m5, ch_mag_int, a_i_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m7, ch_mag_int, a_i_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p7, ch_mag_int, a_i_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p5, ch_mag_int, a_i_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p3, ch_mag_int, a_i_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p1, ch_mag_int, a_i_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m1, ch_mag_int, a_i_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m3, ch_mag_int, a_i_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m5, ch_mag_int, a_i_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m7, ch_mag_int, a_i_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p7, ch_mag_int, a_i_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p5, ch_mag_int, a_i_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p3, ch_mag_int, a_i_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p1, ch_mag_int, a_i_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m1, ch_mag_int, a_i_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m3, ch_mag_int, a_i_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m5, ch_mag_int, a_i_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m7, ch_mag_int, a_i_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p7, ch_mag_int, a_i_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p5, ch_mag_int, a_i_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p3, ch_mag_int, a_i_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p1, ch_mag_int, a_i_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m1, ch_mag_int, a_i_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m3, ch_mag_int, a_i_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m5, ch_mag_int, a_i_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m7, ch_mag_int, a_i_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p7, ch_mag_int, a_i_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p5, ch_mag_int, a_i_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p3, ch_mag_int, a_i_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p1, ch_mag_int, a_i_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m1, ch_mag_int, a_i_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m3, ch_mag_int, a_i_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m5, ch_mag_int, a_i_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m7, ch_mag_int, a_i_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p7, ch_mag_int, a_i_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p5, ch_mag_int, a_i_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p3, ch_mag_int, a_i_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p1, ch_mag_int, a_i_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m1, ch_mag_int, a_i_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m3, ch_mag_int, a_i_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m5, ch_mag_int, a_i_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m7, ch_mag_int, a_i_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    // Calculation of a group of two terms in the bit metric involving product of psi and interference
    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);

    // Calculation of a group of two terms in the bit metric involving squares of interference
    square_a_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p7);
    square_a_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p5);
    square_a_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p3);
    square_a_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p1);
    square_a_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m1);
    square_a_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m3);
    square_a_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m5);
    square_a_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m7);
    square_a_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p7);
    square_a_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p5);
    square_a_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p3);
    square_a_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p1);
    square_a_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m1);
    square_a_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m3);
    square_a_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m5);
    square_a_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m7);
    square_a_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p7);
    square_a_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p5);
    square_a_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p3);
    square_a_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p1);
    square_a_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m1);
    square_a_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m3);
    square_a_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m5);
    square_a_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m7);
    square_a_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p7);
    square_a_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p5);
    square_a_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p3);
    square_a_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p1);
    square_a_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m1);
    square_a_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m3);
    square_a_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m5);
    square_a_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m7);
    square_a_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p7);
    square_a_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p5);
    square_a_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p3);
    square_a_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p1);
    square_a_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m1);
    square_a_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m3);
    square_a_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m5);
    square_a_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m7);
    square_a_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p7);
    square_a_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p5);
    square_a_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p3);
    square_a_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p1);
    square_a_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m1);
    square_a_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m3);
    square_a_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m5);
    square_a_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m7);
    square_a_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p7);
    square_a_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p5);
    square_a_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p3);
    square_a_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p1);
    square_a_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m1);
    square_a_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m3);
    square_a_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m5);
    square_a_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m7);
    square_a_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p7);
    square_a_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p5);
    square_a_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p3);
    square_a_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p1);
    square_a_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m1);
    square_a_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m3);
    square_a_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m5);
    square_a_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m7);

    // Computing different multiples of ||h0||^2
    // x=1, y=1
    ch_mag_2_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
    ch_mag_2_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
    // x=1, y=3
    ch_mag_10_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
    ch_mag_10_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
    // x=1, x=5
    ch_mag_26_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
    ch_mag_26_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
    // x=1, y=7
    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=3, y=3
    ch_mag_18_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
    ch_mag_18_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
    // x=3, y=5
    ch_mag_34_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
    ch_mag_34_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
    // x=3, y=7
    ch_mag_58_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_58_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
    // x=5, y=5
    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=5, y=7
    ch_mag_74_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
    ch_mag_74_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
    // x=7, y=7
    ch_mag_98_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_98_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_98_over_42_with_sigma2,2);

    // Computing Metrics
    xmm0 = _mm256_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_7);
    bit_met_p7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_5);
    bit_met_p7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_3);
    bit_met_p7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_1);
    bit_met_p7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_1);
    bit_met_p7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_3);
    bit_met_p7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_5);
    bit_met_p7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_7);
    bit_met_p7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_7);
    bit_met_p5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_5);
    bit_met_p5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_3);
    bit_met_p5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_1);
    bit_met_p5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_1);
    bit_met_p5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_3);
    bit_met_p5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_5);
    bit_met_p5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_7);
    bit_met_p5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_7);
    bit_met_p3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_5);
    bit_met_p3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_3);
    bit_met_p3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_1);
    bit_met_p3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_1);
    bit_met_p3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_3);
    bit_met_p3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_5);
    bit_met_p3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_7);
    bit_met_p3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_7);
    bit_met_p1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_5);
    bit_met_p1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_3);
    bit_met_p1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_1);
    bit_met_p1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_1);
    bit_met_p1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_3);
    bit_met_p1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_5);
    bit_met_p1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_7);
    bit_met_p1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);

    xmm0 = _mm256_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_7);
    bit_met_m1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_5);
    bit_met_m1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_3);
    bit_met_m1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_1);
    bit_met_m1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_1);
    bit_met_m1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_3);
    bit_met_m1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_5);
    bit_met_m1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_7);
    bit_met_m1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_7);
    bit_met_m3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_5);
    bit_met_m3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_3);
    bit_met_m3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_1);
    bit_met_m3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_1);
    bit_met_m3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_3);
    bit_met_m3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_5);
    bit_met_m3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_7);
    bit_met_m3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_7);
    bit_met_m5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_5);
    bit_met_m5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_3);
    bit_met_m5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_1);
    bit_met_m5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_1);
    bit_met_m5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_3);
    bit_met_m5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_5);
    bit_met_m5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_7);
    bit_met_m5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_7);
    bit_met_m7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_5);
    bit_met_m7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_3);
    bit_met_m7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_1);
    bit_met_m7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_1);
    bit_met_m7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_3);
    bit_met_m7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_5);
    bit_met_m7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_7);
    bit_met_m7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);

    // Detection for 1st bit (LTE mapping)
    // bit = 1
    xmm0 = _mm256_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
    xmm1 = _mm256_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
    xmm2 = _mm256_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
    xmm3 = _mm256_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
    xmm1 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
    xmm2 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
    xmm1 = _mm256_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
    xmm2 = _mm256_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
    xmm1 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
    xmm3 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
    xmm1 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
    xmm2 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
    xmm3 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
    xmm2 = _mm256_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
    xmm3 = _mm256_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
    xmm2 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
    xmm3 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
    xmm1 = _mm256_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
    xmm3 = _mm256_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y0r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 2nd bit (LTE mapping)
    // bit = 1
    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y1r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 3rd bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y2r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 4th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y0i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);


    // Detection for 5th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y1i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 6th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y2i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
    // RE 1
    j = 48*i;
    stream0_out[j + 0] = ((short *)&y0r)[0];
    stream0_out[j + 1] = ((short *)&y1r)[0];
    stream0_out[j + 2] = ((short *)&y2r)[0];
    stream0_out[j + 3] = ((short *)&y0i)[0];
    stream0_out[j + 4] = ((short *)&y1i)[0];
    stream0_out[j + 5] = ((short *)&y2i)[0];
    // RE 2
    stream0_out[j + 6] = ((short *)&y0r)[1];
    stream0_out[j + 7] = ((short *)&y1r)[1];
    stream0_out[j + 8] = ((short *)&y2r)[1];
    stream0_out[j + 9] = ((short *)&y0i)[1];
    stream0_out[j + 10] = ((short *)&y1i)[1];
    stream0_out[j + 11] = ((short *)&y2i)[1];
    // RE 3
    stream0_out[j + 12] = ((short *)&y0r)[2];
    stream0_out[j + 13] = ((short *)&y1r)[2];
    stream0_out[j + 14] = ((short *)&y2r)[2];
    stream0_out[j + 15] = ((short *)&y0i)[2];
    stream0_out[j + 16] = ((short *)&y1i)[2];
    stream0_out[j + 17] = ((short *)&y2i)[2];
    // RE 4
    stream0_out[j + 18] = ((short *)&y0r)[3];
    stream0_out[j + 19] = ((short *)&y1r)[3];
    stream0_out[j + 20] = ((short *)&y2r)[3];
    stream0_out[j + 21] = ((short *)&y0i)[3];
    stream0_out[j + 22] = ((short *)&y1i)[3];
    stream0_out[j + 23] = ((short *)&y2i)[3];
    // RE 5
    stream0_out[j + 24] = ((short *)&y0r)[4];
    stream0_out[j + 25] = ((short *)&y1r)[4];
    stream0_out[j + 26] = ((short *)&y2r)[4];
    stream0_out[j + 27] = ((short *)&y0i)[4];
    stream0_out[j + 28] = ((short *)&y1i)[4];
    stream0_out[j + 29] = ((short *)&y2i)[4];
    // RE 6
    stream0_out[j + 30] = ((short *)&y0r)[5];
    stream0_out[j + 31] = ((short *)&y1r)[5];
    stream0_out[j + 32] = ((short *)&y2r)[5];
    stream0_out[j + 33] = ((short *)&y0i)[5];
    stream0_out[j + 34] = ((short *)&y1i)[5];
    stream0_out[j + 35] = ((short *)&y2i)[5];
    // RE 7
    stream0_out[j + 36] = ((short *)&y0r)[6];
    stream0_out[j + 37] = ((short *)&y1r)[6];
    stream0_out[j + 38] = ((short *)&y2r)[6];
    stream0_out[j + 39] = ((short *)&y0i)[6];
    stream0_out[j + 40] = ((short *)&y1i)[6];
    stream0_out[j + 41] = ((short *)&y2i)[6];
    // RE 8
    stream0_out[j + 42] = ((short *)&y0r)[7];
    stream0_out[j + 43] = ((short *)&y1r)[7];
    stream0_out[j + 44] = ((short *)&y2r)[7];
    stream0_out[j + 45] = ((short *)&y0i)[7];
    stream0_out[j + 46] = ((short *)&y1i)[7];
    stream0_out[j + 47] = ((short *)&y2i)[7];

    // RE 9
    stream0_out[j + 48] = ((short *)&y0r)[8];
    stream0_out[j + 49] = ((short *)&y1r)[8];
    stream0_out[j + 50] = ((short *)&y2r)[8];
    stream0_out[j + 51] = ((short *)&y0i)[8];
    stream0_out[j + 52] = ((short *)&y1i)[8];
    stream0_out[j + 53] = ((short *)&y2i)[8];
    // RE 10
    stream0_out[j + 54] = ((short *)&y0r)[9];
    stream0_out[j + 55] = ((short *)&y1r)[9];
    stream0_out[j + 56] = ((short *)&y2r)[9];
    stream0_out[j + 57] = ((short *)&y0i)[9];
    stream0_out[j + 58] = ((short *)&y1i)[9];
    stream0_out[j + 59] = ((short *)&y2i)[9];
    // RE 11
    stream0_out[j + 60] = ((short *)&y0r)[10];
    stream0_out[j + 61] = ((short *)&y1r)[10];
    stream0_out[j + 62] = ((short *)&y2r)[10];
    stream0_out[j + 63] = ((short *)&y0i)[10];
    stream0_out[j + 64] = ((short *)&y1i)[10];
    stream0_out[j + 65] = ((short *)&y2i)[10];
    // RE 12
    stream0_out[j + 66] = ((short *)&y0r)[11];
    stream0_out[j + 67] = ((short *)&y1r)[11];
    stream0_out[j + 68] = ((short *)&y2r)[11];
    stream0_out[j + 69] = ((short *)&y0i)[11];
    stream0_out[j + 70] = ((short *)&y1i)[11];
    stream0_out[j + 71] = ((short *)&y2i)[11];
    // RE 13
    stream0_out[j + 72] = ((short *)&y0r)[12];
    stream0_out[j + 73] = ((short *)&y1r)[12];
    stream0_out[j + 74] = ((short *)&y2r)[12];
    stream0_out[j + 75] = ((short *)&y0i)[12];
    stream0_out[j + 76] = ((short *)&y1i)[12];
    stream0_out[j + 77] = ((short *)&y2i)[12];
    // RE 14
    stream0_out[j + 78] = ((short *)&y0r)[13];
    stream0_out[j + 79] = ((short *)&y1r)[13];
    stream0_out[j + 80] = ((short *)&y2r)[13];
    stream0_out[j + 81] = ((short *)&y0i)[13];
    stream0_out[j + 82] = ((short *)&y1i)[13];
    stream0_out[j + 83] = ((short *)&y2i)[13];
    // RE 15
    stream0_out[j + 84] = ((short *)&y0r)[14];
    stream0_out[j + 85] = ((short *)&y1r)[14];
    stream0_out[j + 86] = ((short *)&y2r)[14];
    stream0_out[j + 87] = ((short *)&y0i)[14];
    stream0_out[j + 88] = ((short *)&y1i)[14];
    stream0_out[j + 89] = ((short *)&y2i)[14];
    // RE 16
    stream0_out[j + 90] = ((short *)&y0r)[15];
    stream0_out[j + 91] = ((short *)&y1r)[15];
    stream0_out[j + 92] = ((short *)&y2r)[15];
    stream0_out[j + 93] = ((short *)&y0i)[15];
    stream0_out[j + 94] = ((short *)&y1i)[15];
    stream0_out[j + 95] = ((short *)&y2i)[15];

#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

}

void qam64_qam64_avx2(int32_t *stream0_in,
                      int32_t *stream1_in,
                      int32_t *ch_mag,
                      int32_t *ch_mag_i,
                      int16_t *stream0_out,
                      int32_t *rho01,
                      int length
    )
{

  /*
    Author: S. Wagner
    Date: 28-02-17

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)

  __m256i *rho01_256i      = (__m256i *)rho01;
  __m256i *stream0_256i_in = (__m256i *)stream0_in;
  __m256i *stream1_256i_in = (__m256i *)stream1_in;
  __m256i *ch_mag_256i     = (__m256i *)ch_mag;
  __m256i *ch_mag_256i_i   = (__m256i *)ch_mag_i;

  __m256i ONE_OVER_SQRT_42              = _mm256_broadcastw_epi16(_mm_set1_epi16(10112)); // round(1/sqrt(42)*2^16)
  __m256i THREE_OVER_SQRT_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(30337)); // round(3/sqrt(42)*2^16)
  __m256i FIVE_OVER_SQRT_42             = _mm256_broadcastw_epi16(_mm_set1_epi16(25281)); // round(5/sqrt(42)*2^15)
  __m256i SEVEN_OVER_SQRT_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(17697)); // round(7/sqrt(42)*2^14) Q2.14
  __m256i ONE_OVER_SQRT_2               = _mm256_broadcastw_epi16(_mm_set1_epi16(23170)); // round(1/sqrt(2)*2^15)
  __m256i ONE_OVER_SQRT_2_42            = _mm256_broadcastw_epi16(_mm_set1_epi16(3575));  // round(1/sqrt(2*42)*2^15)
  __m256i THREE_OVER_SQRT_2_42          = _mm256_broadcastw_epi16(_mm_set1_epi16(10726)); // round(3/sqrt(2*42)*2^15)
  __m256i FIVE_OVER_SQRT_2_42           = _mm256_broadcastw_epi16(_mm_set1_epi16(17876)); // round(5/sqrt(2*42)*2^15)
  __m256i SEVEN_OVER_SQRT_2_42          = _mm256_broadcastw_epi16(_mm_set1_epi16(25027)); // round(7/sqrt(2*42)*2^15)
  __m256i FORTYNINE_OVER_FOUR_SQRT_42   = _mm256_broadcastw_epi16(_mm_set1_epi16(30969)); // round(49/(4*sqrt(42))*2^14), Q2.14
  __m256i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm256_broadcastw_epi16(_mm_set1_epi16(23385)); // round(37/(4*sqrt(42))*2^14), Q2.14
  __m256i TWENTYFIVE_OVER_FOUR_SQRT_42  = _mm256_broadcastw_epi16(_mm_set1_epi16(31601)); // round(25/(4*sqrt(42))*2^15)
  __m256i TWENTYNINE_OVER_FOUR_SQRT_42  = _mm256_broadcastw_epi16(_mm_set1_epi16(18329)); // round(29/(4*sqrt(42))*2^15), Q2.14
  __m256i SEVENTEEN_OVER_FOUR_SQRT_42   = _mm256_broadcastw_epi16(_mm_set1_epi16(21489)); // round(17/(4*sqrt(42))*2^15)
  __m256i NINE_OVER_FOUR_SQRT_42        = _mm256_broadcastw_epi16(_mm_set1_epi16(11376)); // round(9/(4*sqrt(42))*2^15)
  __m256i THIRTEEN_OVER_FOUR_SQRT_42    = _mm256_broadcastw_epi16(_mm_set1_epi16(16433)); // round(13/(4*sqrt(42))*2^15)
  __m256i FIVE_OVER_FOUR_SQRT_42        = _mm256_broadcastw_epi16(_mm_set1_epi16(6320));  // round(5/(4*sqrt(42))*2^15)
  __m256i ONE_OVER_FOUR_SQRT_42         = _mm256_broadcastw_epi16(_mm_set1_epi16(1264));  // round(1/(4*sqrt(42))*2^15)
  __m256i SQRT_42_OVER_FOUR             = _mm256_broadcastw_epi16(_mm_set1_epi16(13272)); // round(sqrt(42)/4*2^13), Q3.12

  __m256i ch_mag_des;
  __m256i ch_mag_int;
  __m256i ch_mag_98_over_42_with_sigma2;
  __m256i ch_mag_74_over_42_with_sigma2;
  __m256i ch_mag_58_over_42_with_sigma2;
  __m256i ch_mag_50_over_42_with_sigma2;
  __m256i ch_mag_34_over_42_with_sigma2;
  __m256i ch_mag_18_over_42_with_sigma2;
  __m256i ch_mag_26_over_42_with_sigma2;
  __m256i ch_mag_10_over_42_with_sigma2;
  __m256i ch_mag_2_over_42_with_sigma2;
  __m256i y0r_one_over_sqrt_21;
  __m256i y0r_three_over_sqrt_21;
  __m256i y0r_five_over_sqrt_21;
  __m256i y0r_seven_over_sqrt_21;
  __m256i y0i_one_over_sqrt_21;
  __m256i y0i_three_over_sqrt_21;
  __m256i y0i_five_over_sqrt_21;
  __m256i y0i_seven_over_sqrt_21;
  __m256i ch_mag_int_with_sigma2;
  __m256i two_ch_mag_int_with_sigma2;
  __m256i three_ch_mag_int_with_sigma2;
#elif defined(__arm__)

#endif

  int i,j;
  uint32_t len256 = (length)>>3;

  for (i=0; i<len256; i+=2) {

#if defined(__x86_64__) || defined(__i386__)

    // Get rho
      /*
    xmm0 = rho01_256i[i];
    xmm1 = rho01_256i[i+1];
    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));

    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));

    //xmm0 = [Re(0,1,2,3)   Im(0,1,2,3)   Re(4,5,6,7)     Im(4,5,6,7)]
    //xmm0 = [Re(8,9,10,11) Im(8,9,10,11) Re(12,13,14,15) Im(12,13,14,15)]

    xmm2 = _mm256_unpacklo_epi64(xmm0, xmm1);
    //xmm2 = [Re(0,1,2,3) Re(8,9,10,11) Re(4,5,6,7) Re(12,13,14,15)]
    xmm2 = _mm256_permute4x64_epi64(xmm2,0xd8); // Re(rho)

    xmm3 = _mm256_unpackhi_epi64(xmm0, xmm1);
    //xmm3 = [Im(0,1,2,3) Im(8,9,10,11) Im(4,5,6,7) Im(12,13,14,15)]
    xmm3 = _mm256_permute4x64_epi64(xmm3,0xd8); // Im(rho)
      */

    seperate_real_imag_parts(&xmm2, &xmm3, rho01_256i[i], rho01_256i[i+1]);

    rho_rpi = _mm256_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm256_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm256_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
    rho_rmi_1_1 = _mm256_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
    rho_rpi_3_3 = _mm256_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
    rho_rmi_3_3 = _mm256_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
    rho_rpi_5_5 = _mm256_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
    rho_rmi_5_5 = _mm256_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
    rho_rpi_7_7 = _mm256_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
    rho_rmi_7_7 = _mm256_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);

    rho_rpi_5_5 = _mm256_slli_epi16(rho_rpi_5_5, 1);
    rho_rmi_5_5 = _mm256_slli_epi16(rho_rmi_5_5, 1);
    rho_rpi_7_7 = _mm256_slli_epi16(rho_rpi_7_7, 2);
    rho_rmi_7_7 = _mm256_slli_epi16(rho_rmi_7_7, 2);

    xmm4 = _mm256_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
    xmm5 = _mm256_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
    xmm6 = _mm256_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
    xmm7 = _mm256_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
    xmm8 = _mm256_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
    xmm7 = _mm256_slli_epi16(xmm7, 1);
    xmm8 = _mm256_slli_epi16(xmm8, 2);

    rho_rpi_1_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_1_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_1_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_1_5 = _mm256_subs_epi16(xmm4, xmm7);
    rho_rpi_1_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_1_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
    rho_rpi_3_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_3_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_3_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_3_5 = _mm256_subs_epi16(xmm4, xmm7);
    rho_rpi_3_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_3_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
    xmm4 = _mm256_slli_epi16(xmm4, 1);
    rho_rpi_5_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_5_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_5_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_5_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_5_7 = _mm256_adds_epi16(xmm4, xmm8);
    rho_rmi_5_7 = _mm256_subs_epi16(xmm4, xmm8);

    xmm4 = _mm256_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
    xmm4 = _mm256_slli_epi16(xmm4, 2);
    rho_rpi_7_1 = _mm256_adds_epi16(xmm4, xmm5);
    rho_rmi_7_1 = _mm256_subs_epi16(xmm4, xmm5);
    rho_rpi_7_3 = _mm256_adds_epi16(xmm4, xmm6);
    rho_rmi_7_3 = _mm256_subs_epi16(xmm4, xmm6);
    rho_rpi_7_5 = _mm256_adds_epi16(xmm4, xmm7);
    rho_rmi_7_5 = _mm256_subs_epi16(xmm4, xmm7);

    // Rearrange interfering MF output
    /*
    xmm0 = stream1_256i_in[i];
    xmm1 = stream1_256i_in[i+1];
    xmm0 = _mm256_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm256_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));

    xmm1 = _mm256_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm256_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));

    y1r = _mm256_unpacklo_epi64(xmm0, xmm1);
    y1r = _mm256_permute4x64_epi64(y1r,0xd8); // Re(y1)

    y1i = _mm256_unpackhi_epi64(xmm0, xmm1);
    y1i = _mm256_permute4x64_epi64(y1i,0xd8); // Im(y1)
    */

    seperate_real_imag_parts(&y1r, &y1i, stream1_256i_in[i], stream1_256i_in[i+1]);

    // Psi_r calculation from rho_rpi or rho_rmi
    xmm0 = _mm256_broadcastw_epi16(_mm_set1_epi16(0));// ZERO for abs_pi16
    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1r);

    psi_r_p7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1r);
    psi_r_p7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1r);
    psi_r_p7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1r);
    psi_r_p7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1r);
    psi_r_p7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1r);
    psi_r_p7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1r);
    psi_r_p7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1r);
    psi_r_p7_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1r);
    psi_r_p5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1r);
    psi_r_p5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1r);
    psi_r_p5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1r);
    psi_r_p5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1r);
    psi_r_p5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1r);
    psi_r_p5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1r);
    psi_r_p5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1r);
    psi_r_p5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1r);
    psi_r_p3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1r);
    psi_r_p3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1r);
    psi_r_p3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1r);
    psi_r_p3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1r);
    psi_r_p3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1r);
    psi_r_p3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1r);
    psi_r_p3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1r);
    psi_r_p3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1r);
    psi_r_p1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1r);
    psi_r_p1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1r);
    psi_r_p1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1r);
    psi_r_p1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1r);
    psi_r_p1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1r);
    psi_r_p1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1r);
    psi_r_p1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1r);
    psi_r_p1_m7 = _mm256_abs_epi16(xmm2);

    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1r);
    psi_r_m1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1r);
    psi_r_m1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1r);
    psi_r_m1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1r);
    psi_r_m1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1r);
    psi_r_m1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1r);
    psi_r_m1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1r);
    psi_r_m1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1r);
    psi_r_m1_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1r);
    psi_r_m3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1r);
    psi_r_m3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1r);
    psi_r_m3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1r);
    psi_r_m3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1r);
    psi_r_m3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1r);
    psi_r_m3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1r);
    psi_r_m3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1r);
    psi_r_m3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1r);
    psi_r_m5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1r);
    psi_r_m5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1r);
    psi_r_m5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1r);
    psi_r_m5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1r);
    psi_r_m5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1r);
    psi_r_m5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1r);
    psi_r_m5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1r);
    psi_r_m5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1r);
    psi_r_m7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1r);
    psi_r_m7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1r);
    psi_r_m7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1r);
    psi_r_m7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1r);
    psi_r_m7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1r);
    psi_r_m7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1r);
    psi_r_m7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1r);
    psi_r_m7_m7 = _mm256_abs_epi16(xmm2);

    // Psi_i calculation from rho_rpi or rho_rmi
    xmm2 = _mm256_subs_epi16(rho_rmi_7_7, y1i);
    psi_i_p7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_7, y1i);
    psi_i_p7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_7, y1i);
    psi_i_p7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_7, y1i);
    psi_i_p7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_7, y1i);
    psi_i_p7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_7, y1i);
    psi_i_p7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_7, y1i);
    psi_i_p7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_7, y1i);
    psi_i_p7_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_5, y1i);
    psi_i_p5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_5, y1i);
    psi_i_p5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_5, y1i);
    psi_i_p5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_5, y1i);
    psi_i_p5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_5, y1i);
    psi_i_p5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_5, y1i);
    psi_i_p5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_5, y1i);
    psi_i_p5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_5, y1i);
    psi_i_p5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_3, y1i);
    psi_i_p3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_3, y1i);
    psi_i_p3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_3, y1i);
    psi_i_p3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_3, y1i);
    psi_i_p3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_3, y1i);
    psi_i_p3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_3, y1i);
    psi_i_p3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_3, y1i);
    psi_i_p3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_3, y1i);
    psi_i_p3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_7_1, y1i);
    psi_i_p1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_5_1, y1i);
    psi_i_p1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_3_1, y1i);
    psi_i_p1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rmi_1_1, y1i);
    psi_i_p1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_1_1, y1i);
    psi_i_p1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_3_1, y1i);
    psi_i_p1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_5_1, y1i);
    psi_i_p1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rpi_7_1, y1i);
    psi_i_p1_m7 = _mm256_abs_epi16(xmm2);

    xmm2 = _mm256_subs_epi16(rho_rpi_7_1, y1i);
    psi_i_m1_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_1, y1i);
    psi_i_m1_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_1, y1i);
    psi_i_m1_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_1, y1i);
    psi_i_m1_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_1, y1i);
    psi_i_m1_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_1, y1i);
    psi_i_m1_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_1, y1i);
    psi_i_m1_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_1, y1i);
    psi_i_m1_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_3, y1i);
    psi_i_m3_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_3, y1i);
    psi_i_m3_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_3, y1i);
    psi_i_m3_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_3, y1i);
    psi_i_m3_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_3, y1i);
    psi_i_m3_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_3, y1i);
    psi_i_m3_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_3, y1i);
    psi_i_m3_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_3, y1i);
    psi_i_m3_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_5, y1i);
    psi_i_m5_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_5, y1i);
    psi_i_m5_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_5, y1i);
    psi_i_m5_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_5, y1i);
    psi_i_m5_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_5, y1i);
    psi_i_m5_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_5, y1i);
    psi_i_m5_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_5, y1i);
    psi_i_m5_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_5, y1i);
    psi_i_m5_m7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_7_7, y1i);
    psi_i_m7_p7 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_5_7, y1i);
    psi_i_m7_p5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_3_7, y1i);
    psi_i_m7_p3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_subs_epi16(rho_rpi_1_7, y1i);
    psi_i_m7_p1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_1_7, y1i);
    psi_i_m7_m1 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_3_7, y1i);
    psi_i_m7_m3 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_5_7, y1i);
    psi_i_m7_m5 = _mm256_abs_epi16(xmm2);
    xmm2 = _mm256_adds_epi16(rho_rmi_7_7, y1i);
    psi_i_m7_m7 = _mm256_abs_epi16(xmm2);

    /*
    // Rearrange desired MF output
    xmm0 = stream0_256i_in[i];
    xmm1 = stream0_256i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);
    */
    seperate_real_imag_parts(&y0r, &y0i, stream0_256i_in[i], stream0_256i_in[i+1]);

    // Rearrange desired channel magnitudes
    // [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2),...,,|h|^2(7),|h|^2(7)]*(2/sqrt(10))
    /*
    xmm2 = ch_mag_256i[i];
    xmm3 = ch_mag_256i[i+1];
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3);
    */
    // xmm2 is dummy variable that contains the same values as ch_mag_des
    seperate_real_imag_parts(&ch_mag_des, &xmm2, ch_mag_256i[i], ch_mag_256i[i+1]);


    // Rearrange interfering channel magnitudes
    /*
    xmm2 = ch_mag_256i_i[i];
    xmm3 = ch_mag_256i_i[i+1];
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);
    */
    seperate_real_imag_parts(&ch_mag_int, &xmm2, ch_mag_256i_i[i], ch_mag_256i_i[i+1]);

    y0r_one_over_sqrt_21   = _mm256_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
    y0r_three_over_sqrt_21 = _mm256_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm256_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm256_slli_epi16(y0r_five_over_sqrt_21, 1);
    y0r_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
    y0r_seven_over_sqrt_21 = _mm256_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14

    y0i_one_over_sqrt_21   = _mm256_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
    y0i_three_over_sqrt_21 = _mm256_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm256_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm256_slli_epi16(y0i_five_over_sqrt_21, 1);
    y0i_seven_over_sqrt_21 = _mm256_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
    y0i_seven_over_sqrt_21 = _mm256_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14


    y0_p_7_1 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_7_3 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_7_5 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_7_7 = _mm256_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_5_1 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_5_3 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_5_5 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_5_7 = _mm256_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_3_1 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_3_3 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_3_5 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_3_7 = _mm256_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_1_1 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_1_3 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_1_5 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_1_7 = _mm256_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);

    y0_m_1_1 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_1_3 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_1_5 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_1_7 = _mm256_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_3_1 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_3_3 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_3_5 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_3_7 = _mm256_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_5_1 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_5_3 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_5_5 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_5_7 = _mm256_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_7_1 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_7_3 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_7_5 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_7_7 = _mm256_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);

    // Detection of interference term
    ch_mag_int_with_sigma2       = _mm256_srai_epi16(ch_mag_int, 1); // *2
    two_ch_mag_int_with_sigma2   = ch_mag_int; // *4
    three_ch_mag_int_with_sigma2 = _mm256_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6

    interference_abs_64qam_epi16(psi_r_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    interference_abs_64qam_epi16(psi_i_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    // Calculation of a group of two terms in the bit metric involving product of psi and interference
    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);

    // Multiply by sqrt(2)
    psi_a_p7_p7 = _mm256_mulhi_epi16(psi_a_p7_p7, ONE_OVER_SQRT_2);
    psi_a_p7_p7 = _mm256_slli_epi16(psi_a_p7_p7, 2);
    psi_a_p7_p5 = _mm256_mulhi_epi16(psi_a_p7_p5, ONE_OVER_SQRT_2);
    psi_a_p7_p5 = _mm256_slli_epi16(psi_a_p7_p5, 2);
    psi_a_p7_p3 = _mm256_mulhi_epi16(psi_a_p7_p3, ONE_OVER_SQRT_2);
    psi_a_p7_p3 = _mm256_slli_epi16(psi_a_p7_p3, 2);
    psi_a_p7_p1 = _mm256_mulhi_epi16(psi_a_p7_p1, ONE_OVER_SQRT_2);
    psi_a_p7_p1 = _mm256_slli_epi16(psi_a_p7_p1, 2);
    psi_a_p7_m1 = _mm256_mulhi_epi16(psi_a_p7_m1, ONE_OVER_SQRT_2);
    psi_a_p7_m1 = _mm256_slli_epi16(psi_a_p7_m1, 2);
    psi_a_p7_m3 = _mm256_mulhi_epi16(psi_a_p7_m3, ONE_OVER_SQRT_2);
    psi_a_p7_m3 = _mm256_slli_epi16(psi_a_p7_m3, 2);
    psi_a_p7_m5 = _mm256_mulhi_epi16(psi_a_p7_m5, ONE_OVER_SQRT_2);
    psi_a_p7_m5 = _mm256_slli_epi16(psi_a_p7_m5, 2);
    psi_a_p7_m7 = _mm256_mulhi_epi16(psi_a_p7_m7, ONE_OVER_SQRT_2);
    psi_a_p7_m7 = _mm256_slli_epi16(psi_a_p7_m7, 2);
    psi_a_p5_p7 = _mm256_mulhi_epi16(psi_a_p5_p7, ONE_OVER_SQRT_2);
    psi_a_p5_p7 = _mm256_slli_epi16(psi_a_p5_p7, 2);
    psi_a_p5_p5 = _mm256_mulhi_epi16(psi_a_p5_p5, ONE_OVER_SQRT_2);
    psi_a_p5_p5 = _mm256_slli_epi16(psi_a_p5_p5, 2);
    psi_a_p5_p3 = _mm256_mulhi_epi16(psi_a_p5_p3, ONE_OVER_SQRT_2);
    psi_a_p5_p3 = _mm256_slli_epi16(psi_a_p5_p3, 2);
    psi_a_p5_p1 = _mm256_mulhi_epi16(psi_a_p5_p1, ONE_OVER_SQRT_2);
    psi_a_p5_p1 = _mm256_slli_epi16(psi_a_p5_p1, 2);
    psi_a_p5_m1 = _mm256_mulhi_epi16(psi_a_p5_m1, ONE_OVER_SQRT_2);
    psi_a_p5_m1 = _mm256_slli_epi16(psi_a_p5_m1, 2);
    psi_a_p5_m3 = _mm256_mulhi_epi16(psi_a_p5_m3, ONE_OVER_SQRT_2);
    psi_a_p5_m3 = _mm256_slli_epi16(psi_a_p5_m3, 2);
    psi_a_p5_m5 = _mm256_mulhi_epi16(psi_a_p5_m5, ONE_OVER_SQRT_2);
    psi_a_p5_m5 = _mm256_slli_epi16(psi_a_p5_m5, 2);
    psi_a_p5_m7 = _mm256_mulhi_epi16(psi_a_p5_m7, ONE_OVER_SQRT_2);
    psi_a_p5_m7 = _mm256_slli_epi16(psi_a_p5_m7, 2);
    psi_a_p3_p7 = _mm256_mulhi_epi16(psi_a_p3_p7, ONE_OVER_SQRT_2);
    psi_a_p3_p7 = _mm256_slli_epi16(psi_a_p3_p7, 2);
    psi_a_p3_p5 = _mm256_mulhi_epi16(psi_a_p3_p5, ONE_OVER_SQRT_2);
    psi_a_p3_p5 = _mm256_slli_epi16(psi_a_p3_p5, 2);
    psi_a_p3_p3 = _mm256_mulhi_epi16(psi_a_p3_p3, ONE_OVER_SQRT_2);
    psi_a_p3_p3 = _mm256_slli_epi16(psi_a_p3_p3, 2);
    psi_a_p3_p1 = _mm256_mulhi_epi16(psi_a_p3_p1, ONE_OVER_SQRT_2);
    psi_a_p3_p1 = _mm256_slli_epi16(psi_a_p3_p1, 2);
    psi_a_p3_m1 = _mm256_mulhi_epi16(psi_a_p3_m1, ONE_OVER_SQRT_2);
    psi_a_p3_m1 = _mm256_slli_epi16(psi_a_p3_m1, 2);
    psi_a_p3_m3 = _mm256_mulhi_epi16(psi_a_p3_m3, ONE_OVER_SQRT_2);
    psi_a_p3_m3 = _mm256_slli_epi16(psi_a_p3_m3, 2);
    psi_a_p3_m5 = _mm256_mulhi_epi16(psi_a_p3_m5, ONE_OVER_SQRT_2);
    psi_a_p3_m5 = _mm256_slli_epi16(psi_a_p3_m5, 2);
    psi_a_p3_m7 = _mm256_mulhi_epi16(psi_a_p3_m7, ONE_OVER_SQRT_2);
    psi_a_p3_m7 = _mm256_slli_epi16(psi_a_p3_m7, 2);
    psi_a_p1_p7 = _mm256_mulhi_epi16(psi_a_p1_p7, ONE_OVER_SQRT_2);
    psi_a_p1_p7 = _mm256_slli_epi16(psi_a_p1_p7, 2);
    psi_a_p1_p5 = _mm256_mulhi_epi16(psi_a_p1_p5, ONE_OVER_SQRT_2);
    psi_a_p1_p5 = _mm256_slli_epi16(psi_a_p1_p5, 2);
    psi_a_p1_p3 = _mm256_mulhi_epi16(psi_a_p1_p3, ONE_OVER_SQRT_2);
    psi_a_p1_p3 = _mm256_slli_epi16(psi_a_p1_p3, 2);
    psi_a_p1_p1 = _mm256_mulhi_epi16(psi_a_p1_p1, ONE_OVER_SQRT_2);
    psi_a_p1_p1 = _mm256_slli_epi16(psi_a_p1_p1, 2);
    psi_a_p1_m1 = _mm256_mulhi_epi16(psi_a_p1_m1, ONE_OVER_SQRT_2);
    psi_a_p1_m1 = _mm256_slli_epi16(psi_a_p1_m1, 2);
    psi_a_p1_m3 = _mm256_mulhi_epi16(psi_a_p1_m3, ONE_OVER_SQRT_2);
    psi_a_p1_m3 = _mm256_slli_epi16(psi_a_p1_m3, 2);
    psi_a_p1_m5 = _mm256_mulhi_epi16(psi_a_p1_m5, ONE_OVER_SQRT_2);
    psi_a_p1_m5 = _mm256_slli_epi16(psi_a_p1_m5, 2);
    psi_a_p1_m7 = _mm256_mulhi_epi16(psi_a_p1_m7, ONE_OVER_SQRT_2);
    psi_a_p1_m7 = _mm256_slli_epi16(psi_a_p1_m7, 2);
    psi_a_m1_p7 = _mm256_mulhi_epi16(psi_a_m1_p7, ONE_OVER_SQRT_2);
    psi_a_m1_p7 = _mm256_slli_epi16(psi_a_m1_p7, 2);
    psi_a_m1_p5 = _mm256_mulhi_epi16(psi_a_m1_p5, ONE_OVER_SQRT_2);
    psi_a_m1_p5 = _mm256_slli_epi16(psi_a_m1_p5, 2);
    psi_a_m1_p3 = _mm256_mulhi_epi16(psi_a_m1_p3, ONE_OVER_SQRT_2);
    psi_a_m1_p3 = _mm256_slli_epi16(psi_a_m1_p3, 2);
    psi_a_m1_p1 = _mm256_mulhi_epi16(psi_a_m1_p1, ONE_OVER_SQRT_2);
    psi_a_m1_p1 = _mm256_slli_epi16(psi_a_m1_p1, 2);
    psi_a_m1_m1 = _mm256_mulhi_epi16(psi_a_m1_m1, ONE_OVER_SQRT_2);
    psi_a_m1_m1 = _mm256_slli_epi16(psi_a_m1_m1, 2);
    psi_a_m1_m3 = _mm256_mulhi_epi16(psi_a_m1_m3, ONE_OVER_SQRT_2);
    psi_a_m1_m3 = _mm256_slli_epi16(psi_a_m1_m3, 2);
    psi_a_m1_m5 = _mm256_mulhi_epi16(psi_a_m1_m5, ONE_OVER_SQRT_2);
    psi_a_m1_m5 = _mm256_slli_epi16(psi_a_m1_m5, 2);
    psi_a_m1_m7 = _mm256_mulhi_epi16(psi_a_m1_m7, ONE_OVER_SQRT_2);
    psi_a_m1_m7 = _mm256_slli_epi16(psi_a_m1_m7, 2);
    psi_a_m3_p7 = _mm256_mulhi_epi16(psi_a_m3_p7, ONE_OVER_SQRT_2);
    psi_a_m3_p7 = _mm256_slli_epi16(psi_a_m3_p7, 2);
    psi_a_m3_p5 = _mm256_mulhi_epi16(psi_a_m3_p5, ONE_OVER_SQRT_2);
    psi_a_m3_p5 = _mm256_slli_epi16(psi_a_m3_p5, 2);
    psi_a_m3_p3 = _mm256_mulhi_epi16(psi_a_m3_p3, ONE_OVER_SQRT_2);
    psi_a_m3_p3 = _mm256_slli_epi16(psi_a_m3_p3, 2);
    psi_a_m3_p1 = _mm256_mulhi_epi16(psi_a_m3_p1, ONE_OVER_SQRT_2);
    psi_a_m3_p1 = _mm256_slli_epi16(psi_a_m3_p1, 2);
    psi_a_m3_m1 = _mm256_mulhi_epi16(psi_a_m3_m1, ONE_OVER_SQRT_2);
    psi_a_m3_m1 = _mm256_slli_epi16(psi_a_m3_m1, 2);
    psi_a_m3_m3 = _mm256_mulhi_epi16(psi_a_m3_m3, ONE_OVER_SQRT_2);
    psi_a_m3_m3 = _mm256_slli_epi16(psi_a_m3_m3, 2);
    psi_a_m3_m5 = _mm256_mulhi_epi16(psi_a_m3_m5, ONE_OVER_SQRT_2);
    psi_a_m3_m5 = _mm256_slli_epi16(psi_a_m3_m5, 2);
    psi_a_m3_m7 = _mm256_mulhi_epi16(psi_a_m3_m7, ONE_OVER_SQRT_2);
    psi_a_m3_m7 = _mm256_slli_epi16(psi_a_m3_m7, 2);
    psi_a_m5_p7 = _mm256_mulhi_epi16(psi_a_m5_p7, ONE_OVER_SQRT_2);
    psi_a_m5_p7 = _mm256_slli_epi16(psi_a_m5_p7, 2);
    psi_a_m5_p5 = _mm256_mulhi_epi16(psi_a_m5_p5, ONE_OVER_SQRT_2);
    psi_a_m5_p5 = _mm256_slli_epi16(psi_a_m5_p5, 2);
    psi_a_m5_p3 = _mm256_mulhi_epi16(psi_a_m5_p3, ONE_OVER_SQRT_2);
    psi_a_m5_p3 = _mm256_slli_epi16(psi_a_m5_p3, 2);
    psi_a_m5_p1 = _mm256_mulhi_epi16(psi_a_m5_p1, ONE_OVER_SQRT_2);
    psi_a_m5_p1 = _mm256_slli_epi16(psi_a_m5_p1, 2);
    psi_a_m5_m1 = _mm256_mulhi_epi16(psi_a_m5_m1, ONE_OVER_SQRT_2);
    psi_a_m5_m1 = _mm256_slli_epi16(psi_a_m5_m1, 2);
    psi_a_m5_m3 = _mm256_mulhi_epi16(psi_a_m5_m3, ONE_OVER_SQRT_2);
    psi_a_m5_m3 = _mm256_slli_epi16(psi_a_m5_m3, 2);
    psi_a_m5_m5 = _mm256_mulhi_epi16(psi_a_m5_m5, ONE_OVER_SQRT_2);
    psi_a_m5_m5 = _mm256_slli_epi16(psi_a_m5_m5, 2);
    psi_a_m5_m7 = _mm256_mulhi_epi16(psi_a_m5_m7, ONE_OVER_SQRT_2);
    psi_a_m5_m7 = _mm256_slli_epi16(psi_a_m5_m7, 2);
    psi_a_m7_p7 = _mm256_mulhi_epi16(psi_a_m7_p7, ONE_OVER_SQRT_2);
    psi_a_m7_p7 = _mm256_slli_epi16(psi_a_m7_p7, 2);
    psi_a_m7_p5 = _mm256_mulhi_epi16(psi_a_m7_p5, ONE_OVER_SQRT_2);
    psi_a_m7_p5 = _mm256_slli_epi16(psi_a_m7_p5, 2);
    psi_a_m7_p3 = _mm256_mulhi_epi16(psi_a_m7_p3, ONE_OVER_SQRT_2);
    psi_a_m7_p3 = _mm256_slli_epi16(psi_a_m7_p3, 2);
    psi_a_m7_p1 = _mm256_mulhi_epi16(psi_a_m7_p1, ONE_OVER_SQRT_2);
    psi_a_m7_p1 = _mm256_slli_epi16(psi_a_m7_p1, 2);
    psi_a_m7_m1 = _mm256_mulhi_epi16(psi_a_m7_m1, ONE_OVER_SQRT_2);
    psi_a_m7_m1 = _mm256_slli_epi16(psi_a_m7_m1, 2);
    psi_a_m7_m3 = _mm256_mulhi_epi16(psi_a_m7_m3, ONE_OVER_SQRT_2);
    psi_a_m7_m3 = _mm256_slli_epi16(psi_a_m7_m3, 2);
    psi_a_m7_m5 = _mm256_mulhi_epi16(psi_a_m7_m5, ONE_OVER_SQRT_2);
    psi_a_m7_m5 = _mm256_slli_epi16(psi_a_m7_m5, 2);
    psi_a_m7_m7 = _mm256_mulhi_epi16(psi_a_m7_m7, ONE_OVER_SQRT_2);
    psi_a_m7_m7 = _mm256_slli_epi16(psi_a_m7_m7, 2);

    // Calculation of a group of two terms in the bit metric involving squares of interference
    square_a_64qam_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p7);
    square_a_64qam_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p5);
    square_a_64qam_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p3);
    square_a_64qam_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p1);
    square_a_64qam_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m1);
    square_a_64qam_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m3);
    square_a_64qam_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m5);
    square_a_64qam_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m7);
    square_a_64qam_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p7);
    square_a_64qam_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p5);
    square_a_64qam_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p3);
    square_a_64qam_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p1);
    square_a_64qam_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m1);
    square_a_64qam_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m3);
    square_a_64qam_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m5);
    square_a_64qam_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m7);
    square_a_64qam_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p7);
    square_a_64qam_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p5);
    square_a_64qam_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p3);
    square_a_64qam_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p1);
    square_a_64qam_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m1);
    square_a_64qam_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m3);
    square_a_64qam_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m5);
    square_a_64qam_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m7);
    square_a_64qam_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p7);
    square_a_64qam_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p5);
    square_a_64qam_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p3);
    square_a_64qam_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p1);
    square_a_64qam_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m1);
    square_a_64qam_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m3);
    square_a_64qam_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m5);
    square_a_64qam_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m7);
    square_a_64qam_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p7);
    square_a_64qam_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p5);
    square_a_64qam_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p3);
    square_a_64qam_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p1);
    square_a_64qam_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m1);
    square_a_64qam_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m3);
    square_a_64qam_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m5);
    square_a_64qam_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m7);
    square_a_64qam_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p7);
    square_a_64qam_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p5);
    square_a_64qam_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p3);
    square_a_64qam_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p1);
    square_a_64qam_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m1);
    square_a_64qam_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m3);
    square_a_64qam_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m5);
    square_a_64qam_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m7);
    square_a_64qam_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p7);
    square_a_64qam_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p5);
    square_a_64qam_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p3);
    square_a_64qam_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p1);
    square_a_64qam_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m1);
    square_a_64qam_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m3);
    square_a_64qam_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m5);
    square_a_64qam_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m7);
    square_a_64qam_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p7);
    square_a_64qam_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p5);
    square_a_64qam_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p3);
    square_a_64qam_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p1);
    square_a_64qam_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m1);
    square_a_64qam_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m3);
    square_a_64qam_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m5);
    square_a_64qam_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m7);

    // Computing different multiples of ||h0||^2
    // x=1, y=1
    ch_mag_2_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
    ch_mag_2_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
    // x=1, y=3
    ch_mag_10_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
    ch_mag_10_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
    // x=1, x=5
    ch_mag_26_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
    ch_mag_26_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
    // x=1, y=7
    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=3, y=3
    ch_mag_18_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
    ch_mag_18_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
    // x=3, y=5
    ch_mag_34_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
    ch_mag_34_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
    // x=3, y=7
    ch_mag_58_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_58_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
    // x=5, y=5
    ch_mag_50_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=5, y=7
    ch_mag_74_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
    ch_mag_74_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
    // x=7, y=7
    ch_mag_98_over_42_with_sigma2 = _mm256_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_98_over_42_with_sigma2 = _mm256_slli_epi16(ch_mag_98_over_42_with_sigma2,2);

    // Computing Metrics
    xmm0 = _mm256_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_7);
    bit_met_p7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_5);
    bit_met_p7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_3);
    bit_met_p7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_7_1);
    bit_met_p7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_1);
    bit_met_p7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_3);
    bit_met_p7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_5);
    bit_met_p7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_7_7);
    bit_met_p7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_7);
    bit_met_p5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_5);
    bit_met_p5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_3);
    bit_met_p5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_5_1);
    bit_met_p5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_1);
    bit_met_p5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_3);
    bit_met_p5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_5);
    bit_met_p5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_5_7);
    bit_met_p5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_7);
    bit_met_p3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_5);
    bit_met_p3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_3);
    bit_met_p3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_3_1);
    bit_met_p3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_1);
    bit_met_p3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_3);
    bit_met_p3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_5);
    bit_met_p3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_3_7);
    bit_met_p3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_7);
    bit_met_p1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_5);
    bit_met_p1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_3);
    bit_met_p1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_p_1_1);
    bit_met_p1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_1);
    bit_met_p1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_3);
    bit_met_p1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_5);
    bit_met_p1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
    xmm1 = _mm256_adds_epi16(xmm0, y0_m_1_7);
    bit_met_p1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);

    xmm0 = _mm256_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_7);
    bit_met_m1_p7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_5);
    bit_met_m1_p5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_3);
    bit_met_m1_p3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_1_1);
    bit_met_m1_p1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_1);
    bit_met_m1_m1 = _mm256_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_3);
    bit_met_m1_m3 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_5);
    bit_met_m1_m5 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_1_7);
    bit_met_m1_m7 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_7);
    bit_met_m3_p7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_5);
    bit_met_m3_p5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_3);
    bit_met_m3_p3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_3_1);
    bit_met_m3_p1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_1);
    bit_met_m3_m1 = _mm256_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_3);
    bit_met_m3_m3 = _mm256_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_5);
    bit_met_m3_m5 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_3_7);
    bit_met_m3_m7 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_7);
    bit_met_m5_p7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_5);
    bit_met_m5_p5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_3);
    bit_met_m5_p3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_5_1);
    bit_met_m5_p1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_1);
    bit_met_m5_m1 = _mm256_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_3);
    bit_met_m5_m3 = _mm256_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_5);
    bit_met_m5_m5 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_5_7);
    bit_met_m5_m7 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_7);
    bit_met_m7_p7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_5);
    bit_met_m7_p5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_3);
    bit_met_m7_p3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_m_7_1);
    bit_met_m7_p1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_1);
    bit_met_m7_m1 = _mm256_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_3);
    bit_met_m7_m3 = _mm256_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_5);
    bit_met_m7_m5 = _mm256_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm256_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
    xmm1 = _mm256_subs_epi16(xmm0, y0_p_7_7);
    bit_met_m7_m7 = _mm256_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);

    // Detection for 1st bit (LTE mapping)
    // bit = 1
    xmm0 = _mm256_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
    xmm1 = _mm256_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
    xmm2 = _mm256_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
    xmm3 = _mm256_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
    xmm1 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
    xmm2 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
    xmm1 = _mm256_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
    xmm2 = _mm256_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
    xmm1 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
    xmm3 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
    xmm1 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
    xmm2 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
    xmm3 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
    xmm2 = _mm256_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
    xmm3 = _mm256_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
    xmm2 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
    xmm3 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
    xmm1 = _mm256_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
    xmm3 = _mm256_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y0r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 2nd bit (LTE mapping)
    // bit = 1
    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y1r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 3rd bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y2r = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 4th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y0i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);


    // Detection for 5th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm256_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm256_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm256_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm256_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm256_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm256_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm256_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm256_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm256_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y1i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 6th bit (LTE mapping)
    xmm0 = _mm256_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm256_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm256_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm256_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm256_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm256_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm256_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm256_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm256_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm256_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm256_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm256_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm256_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm256_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm256_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm256_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm256_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm256_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(xmm4, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm256_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm256_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm256_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm256_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm256_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm256_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm256_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm256_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm256_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm256_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm256_max_epi16(xmm0, xmm1);
    xmm5 = _mm256_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm256_max_epi16(logmax_num_re0, xmm5);

    y2i = _mm256_subs_epi16(logmax_num_re0, logmax_den_re0);

    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
    // RE 1
    j = 48*i;
    stream0_out[j + 0] = ((short *)&y0r)[0];
    stream0_out[j + 1] = ((short *)&y1r)[0];
    stream0_out[j + 2] = ((short *)&y2r)[0];
    stream0_out[j + 3] = ((short *)&y0i)[0];
    stream0_out[j + 4] = ((short *)&y1i)[0];
    stream0_out[j + 5] = ((short *)&y2i)[0];
    // RE 2
    stream0_out[j + 6] = ((short *)&y0r)[1];
    stream0_out[j + 7] = ((short *)&y1r)[1];
    stream0_out[j + 8] = ((short *)&y2r)[1];
    stream0_out[j + 9] = ((short *)&y0i)[1];
    stream0_out[j + 10] = ((short *)&y1i)[1];
    stream0_out[j + 11] = ((short *)&y2i)[1];
    // RE 3
    stream0_out[j + 12] = ((short *)&y0r)[2];
    stream0_out[j + 13] = ((short *)&y1r)[2];
    stream0_out[j + 14] = ((short *)&y2r)[2];
    stream0_out[j + 15] = ((short *)&y0i)[2];
    stream0_out[j + 16] = ((short *)&y1i)[2];
    stream0_out[j + 17] = ((short *)&y2i)[2];
    // RE 4
    stream0_out[j + 18] = ((short *)&y0r)[3];
    stream0_out[j + 19] = ((short *)&y1r)[3];
    stream0_out[j + 20] = ((short *)&y2r)[3];
    stream0_out[j + 21] = ((short *)&y0i)[3];
    stream0_out[j + 22] = ((short *)&y1i)[3];
    stream0_out[j + 23] = ((short *)&y2i)[3];
    // RE 5
    stream0_out[j + 24] = ((short *)&y0r)[4];
    stream0_out[j + 25] = ((short *)&y1r)[4];
    stream0_out[j + 26] = ((short *)&y2r)[4];
    stream0_out[j + 27] = ((short *)&y0i)[4];
    stream0_out[j + 28] = ((short *)&y1i)[4];
    stream0_out[j + 29] = ((short *)&y2i)[4];
    // RE 6
    stream0_out[j + 30] = ((short *)&y0r)[5];
    stream0_out[j + 31] = ((short *)&y1r)[5];
    stream0_out[j + 32] = ((short *)&y2r)[5];
    stream0_out[j + 33] = ((short *)&y0i)[5];
    stream0_out[j + 34] = ((short *)&y1i)[5];
    stream0_out[j + 35] = ((short *)&y2i)[5];
    // RE 7
    stream0_out[j + 36] = ((short *)&y0r)[6];
    stream0_out[j + 37] = ((short *)&y1r)[6];
    stream0_out[j + 38] = ((short *)&y2r)[6];
    stream0_out[j + 39] = ((short *)&y0i)[6];
    stream0_out[j + 40] = ((short *)&y1i)[6];
    stream0_out[j + 41] = ((short *)&y2i)[6];
    // RE 8
    stream0_out[j + 42] = ((short *)&y0r)[7];
    stream0_out[j + 43] = ((short *)&y1r)[7];
    stream0_out[j + 44] = ((short *)&y2r)[7];
    stream0_out[j + 45] = ((short *)&y0i)[7];
    stream0_out[j + 46] = ((short *)&y1i)[7];
    stream0_out[j + 47] = ((short *)&y2i)[7];

    // RE 9
    stream0_out[j + 48] = ((short *)&y0r)[8];
    stream0_out[j + 49] = ((short *)&y1r)[8];
    stream0_out[j + 50] = ((short *)&y2r)[8];
    stream0_out[j + 51] = ((short *)&y0i)[8];
    stream0_out[j + 52] = ((short *)&y1i)[8];
    stream0_out[j + 53] = ((short *)&y2i)[8];
    // RE 10
    stream0_out[j + 54] = ((short *)&y0r)[9];
    stream0_out[j + 55] = ((short *)&y1r)[9];
    stream0_out[j + 56] = ((short *)&y2r)[9];
    stream0_out[j + 57] = ((short *)&y0i)[9];
    stream0_out[j + 58] = ((short *)&y1i)[9];
    stream0_out[j + 59] = ((short *)&y2i)[9];
    // RE 11
    stream0_out[j + 60] = ((short *)&y0r)[10];
    stream0_out[j + 61] = ((short *)&y1r)[10];
    stream0_out[j + 62] = ((short *)&y2r)[10];
    stream0_out[j + 63] = ((short *)&y0i)[10];
    stream0_out[j + 64] = ((short *)&y1i)[10];
    stream0_out[j + 65] = ((short *)&y2i)[10];
    // RE 12
    stream0_out[j + 66] = ((short *)&y0r)[11];
    stream0_out[j + 67] = ((short *)&y1r)[11];
    stream0_out[j + 68] = ((short *)&y2r)[11];
    stream0_out[j + 69] = ((short *)&y0i)[11];
    stream0_out[j + 70] = ((short *)&y1i)[11];
    stream0_out[j + 71] = ((short *)&y2i)[11];
    // RE 13
    stream0_out[j + 72] = ((short *)&y0r)[12];
    stream0_out[j + 73] = ((short *)&y1r)[12];
    stream0_out[j + 74] = ((short *)&y2r)[12];
    stream0_out[j + 75] = ((short *)&y0i)[12];
    stream0_out[j + 76] = ((short *)&y1i)[12];
    stream0_out[j + 77] = ((short *)&y2i)[12];
    // RE 14
    stream0_out[j + 78] = ((short *)&y0r)[13];
    stream0_out[j + 79] = ((short *)&y1r)[13];
    stream0_out[j + 80] = ((short *)&y2r)[13];
    stream0_out[j + 81] = ((short *)&y0i)[13];
    stream0_out[j + 82] = ((short *)&y1i)[13];
    stream0_out[j + 83] = ((short *)&y2i)[13];
    // RE 15
    stream0_out[j + 84] = ((short *)&y0r)[14];
    stream0_out[j + 85] = ((short *)&y1r)[14];
    stream0_out[j + 86] = ((short *)&y2r)[14];
    stream0_out[j + 87] = ((short *)&y0i)[14];
    stream0_out[j + 88] = ((short *)&y1i)[14];
    stream0_out[j + 89] = ((short *)&y2i)[14];
    // RE 16
    stream0_out[j + 90] = ((short *)&y0r)[15];
    stream0_out[j + 91] = ((short *)&y1r)[15];
    stream0_out[j + 92] = ((short *)&y2r)[15];
    stream0_out[j + 93] = ((short *)&y0i)[15];
    stream0_out[j + 94] = ((short *)&y1i)[15];
    stream0_out[j + 95] = ((short *)&y2i)[15];

#elif defined(__arm__)

#endif

  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}