/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.1  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/*! \file PHY/NR_UE_TRANSPORT/nr_dlsch_llr_computation.c
 * \brief Top-level routines for LLR computation of the PDSCH physical channel
 * \author H. WANG
 * \date 2018
 * \version 0.1
 * \company Eurecom
 * \email:
 * \note
 * \warning
 */

#include "PHY/defs_nr_UE.h"
#include "PHY/phy_extern_nr_ue.h"
#include "nr_transport_proto_ue.h"
#include "PHY/TOOLS/tools_defs.h"
#include "PHY/sse_intrin.h"

//#define DEBUG_LLR_SIC


// Constant 128-bit bit masks stored as eight 16-bit lanes. They are aligned to
// 16 bytes because nr_ones is read through an __m128i pointer by the
// interference_abs_* macros in this file.
// nr_zeros: every bit clear.
int16_t nr_zeros[8] __attribute__ ((aligned(16))) = {0,0,0,0,0,0,0,0};
// nr_ones: every bit set. Written as -1 rather than 0xffff: 0xffff is the int
// value 65535, which does not fit in int16_t, so initialising with it relies on
// an implementation-defined conversion and triggers overflow warnings. -1 is
// always representable and has the intended all-ones bit pattern.
int16_t nr_ones[8] __attribute__ ((aligned(16))) = {-1,-1,-1,-1,-1,-1,-1,-1};
#if defined(__x86_64__) || defined(__i386__)
// File-scope 128-bit SSE variables, only compiled on x86/x86-64 builds.
// They are non-static and uninitialised, so each name has external linkage.
// None of them is referenced in the part of the file visible here.
// NOTE(review): the names suggest "rho real plus imag" (rpi) and "rho real
// minus imag" (rmi), with the _<a>_<b> variants indexed by the odd values
// 1,3,5,7 - confirm against the code that uses them.
__m128i rho_rpi __attribute__ ((aligned(16)));
__m128i rho_rmi __attribute__ ((aligned(16)));
__m128i rho_rpi_1_1 __attribute__ ((aligned(16)));
__m128i rho_rpi_1_3 __attribute__ ((aligned(16)));
__m128i rho_rpi_1_5 __attribute__ ((aligned(16)));
__m128i rho_rpi_1_7 __attribute__ ((aligned(16)));
__m128i rho_rpi_3_1 __attribute__ ((aligned(16)));
__m128i rho_rpi_3_3 __attribute__ ((aligned(16)));
__m128i rho_rpi_3_5 __attribute__ ((aligned(16)));
__m128i rho_rpi_3_7 __attribute__ ((aligned(16)));
__m128i rho_rpi_5_1 __attribute__ ((aligned(16)));
__m128i rho_rpi_5_3 __attribute__ ((aligned(16)));
__m128i rho_rpi_5_5 __attribute__ ((aligned(16)));
__m128i rho_rpi_5_7 __attribute__ ((aligned(16)));
__m128i rho_rpi_7_1 __attribute__ ((aligned(16)));
__m128i rho_rpi_7_3 __attribute__ ((aligned(16)));
__m128i rho_rpi_7_5 __attribute__ ((aligned(16)));
__m128i rho_rpi_7_7 __attribute__ ((aligned(16)));
__m128i rho_rmi_1_1 __attribute__ ((aligned(16)));
__m128i rho_rmi_1_3 __attribute__ ((aligned(16)));
__m128i rho_rmi_1_5 __attribute__ ((aligned(16)));
__m128i rho_rmi_1_7 __attribute__ ((aligned(16)));
__m128i rho_rmi_3_1 __attribute__ ((aligned(16)));
__m128i rho_rmi_3_3 __attribute__ ((aligned(16)));
__m128i rho_rmi_3_5 __attribute__ ((aligned(16)));
__m128i rho_rmi_3_7 __attribute__ ((aligned(16)));
__m128i rho_rmi_5_1 __attribute__ ((aligned(16)));
__m128i rho_rmi_5_3 __attribute__ ((aligned(16)));
__m128i rho_rmi_5_5 __attribute__ ((aligned(16)));
__m128i rho_rmi_5_7 __attribute__ ((aligned(16)));
__m128i rho_rmi_7_1 __attribute__ ((aligned(16)));
__m128i rho_rmi_7_3 __attribute__ ((aligned(16)));
__m128i rho_rmi_7_5 __attribute__ ((aligned(16)));
__m128i rho_rmi_7_7 __attribute__ ((aligned(16)));

// psi_r_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably m/p mean minus/plus and psi_r is the real part of a
// per-candidate "psi" term - confirm against the code that fills these in.
__m128i psi_r_m7_m7 __attribute__ ((aligned(16)));
__m128i psi_r_m7_m5 __attribute__ ((aligned(16)));
__m128i psi_r_m7_m3 __attribute__ ((aligned(16)));
__m128i psi_r_m7_m1 __attribute__ ((aligned(16)));
__m128i psi_r_m7_p1 __attribute__ ((aligned(16)));
__m128i psi_r_m7_p3 __attribute__ ((aligned(16)));
__m128i psi_r_m7_p5 __attribute__ ((aligned(16)));
__m128i psi_r_m7_p7 __attribute__ ((aligned(16)));
__m128i psi_r_m5_m7 __attribute__ ((aligned(16)));
__m128i psi_r_m5_m5 __attribute__ ((aligned(16)));
__m128i psi_r_m5_m3 __attribute__ ((aligned(16)));
__m128i psi_r_m5_m1 __attribute__ ((aligned(16)));
__m128i psi_r_m5_p1 __attribute__ ((aligned(16)));
__m128i psi_r_m5_p3 __attribute__ ((aligned(16)));
__m128i psi_r_m5_p5 __attribute__ ((aligned(16)));
__m128i psi_r_m5_p7 __attribute__ ((aligned(16)));
__m128i psi_r_m3_m7 __attribute__ ((aligned(16)));
__m128i psi_r_m3_m5 __attribute__ ((aligned(16)));
__m128i psi_r_m3_m3 __attribute__ ((aligned(16)));
__m128i psi_r_m3_m1 __attribute__ ((aligned(16)));
__m128i psi_r_m3_p1 __attribute__ ((aligned(16)));
__m128i psi_r_m3_p3 __attribute__ ((aligned(16)));
__m128i psi_r_m3_p5 __attribute__ ((aligned(16)));
__m128i psi_r_m3_p7 __attribute__ ((aligned(16)));
__m128i psi_r_m1_m7 __attribute__ ((aligned(16)));
__m128i psi_r_m1_m5 __attribute__ ((aligned(16)));
__m128i psi_r_m1_m3 __attribute__ ((aligned(16)));
__m128i psi_r_m1_m1 __attribute__ ((aligned(16)));
__m128i psi_r_m1_p1 __attribute__ ((aligned(16)));
__m128i psi_r_m1_p3 __attribute__ ((aligned(16)));
__m128i psi_r_m1_p5 __attribute__ ((aligned(16)));
__m128i psi_r_m1_p7 __attribute__ ((aligned(16)));
__m128i psi_r_p1_m7 __attribute__ ((aligned(16)));
__m128i psi_r_p1_m5 __attribute__ ((aligned(16)));
__m128i psi_r_p1_m3 __attribute__ ((aligned(16)));
__m128i psi_r_p1_m1 __attribute__ ((aligned(16)));
__m128i psi_r_p1_p1 __attribute__ ((aligned(16)));
__m128i psi_r_p1_p3 __attribute__ ((aligned(16)));
__m128i psi_r_p1_p5 __attribute__ ((aligned(16)));
__m128i psi_r_p1_p7 __attribute__ ((aligned(16)));
__m128i psi_r_p3_m7 __attribute__ ((aligned(16)));
__m128i psi_r_p3_m5 __attribute__ ((aligned(16)));
__m128i psi_r_p3_m3 __attribute__ ((aligned(16)));
__m128i psi_r_p3_m1 __attribute__ ((aligned(16)));
__m128i psi_r_p3_p1 __attribute__ ((aligned(16)));
__m128i psi_r_p3_p3 __attribute__ ((aligned(16)));
__m128i psi_r_p3_p5 __attribute__ ((aligned(16)));
__m128i psi_r_p3_p7 __attribute__ ((aligned(16)));
__m128i psi_r_p5_m7 __attribute__ ((aligned(16)));
__m128i psi_r_p5_m5 __attribute__ ((aligned(16)));
__m128i psi_r_p5_m3 __attribute__ ((aligned(16)));
__m128i psi_r_p5_m1 __attribute__ ((aligned(16)));
__m128i psi_r_p5_p1 __attribute__ ((aligned(16)));
__m128i psi_r_p5_p3 __attribute__ ((aligned(16)));
__m128i psi_r_p5_p5 __attribute__ ((aligned(16)));
__m128i psi_r_p5_p7 __attribute__ ((aligned(16)));
__m128i psi_r_p7_m7 __attribute__ ((aligned(16)));
__m128i psi_r_p7_m5 __attribute__ ((aligned(16)));
__m128i psi_r_p7_m3 __attribute__ ((aligned(16)));
__m128i psi_r_p7_m1 __attribute__ ((aligned(16)));
__m128i psi_r_p7_p1 __attribute__ ((aligned(16)));
__m128i psi_r_p7_p3 __attribute__ ((aligned(16)));
__m128i psi_r_p7_p5 __attribute__ ((aligned(16)));
__m128i psi_r_p7_p7 __attribute__ ((aligned(16)));

// psi_i_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the imaginary-part counterpart of the psi_r_*
// variables - confirm against the code that fills these in.
__m128i psi_i_m7_m7 __attribute__ ((aligned(16)));
__m128i psi_i_m7_m5 __attribute__ ((aligned(16)));
__m128i psi_i_m7_m3 __attribute__ ((aligned(16)));
__m128i psi_i_m7_m1 __attribute__ ((aligned(16)));
__m128i psi_i_m7_p1 __attribute__ ((aligned(16)));
__m128i psi_i_m7_p3 __attribute__ ((aligned(16)));
__m128i psi_i_m7_p5 __attribute__ ((aligned(16)));
__m128i psi_i_m7_p7 __attribute__ ((aligned(16)));
__m128i psi_i_m5_m7 __attribute__ ((aligned(16)));
__m128i psi_i_m5_m5 __attribute__ ((aligned(16)));
__m128i psi_i_m5_m3 __attribute__ ((aligned(16)));
__m128i psi_i_m5_m1 __attribute__ ((aligned(16)));
__m128i psi_i_m5_p1 __attribute__ ((aligned(16)));
__m128i psi_i_m5_p3 __attribute__ ((aligned(16)));
__m128i psi_i_m5_p5 __attribute__ ((aligned(16)));
__m128i psi_i_m5_p7 __attribute__ ((aligned(16)));
__m128i psi_i_m3_m7 __attribute__ ((aligned(16)));
__m128i psi_i_m3_m5 __attribute__ ((aligned(16)));
__m128i psi_i_m3_m3 __attribute__ ((aligned(16)));
__m128i psi_i_m3_m1 __attribute__ ((aligned(16)));
__m128i psi_i_m3_p1 __attribute__ ((aligned(16)));
__m128i psi_i_m3_p3 __attribute__ ((aligned(16)));
__m128i psi_i_m3_p5 __attribute__ ((aligned(16)));
__m128i psi_i_m3_p7 __attribute__ ((aligned(16)));
__m128i psi_i_m1_m7 __attribute__ ((aligned(16)));
__m128i psi_i_m1_m5 __attribute__ ((aligned(16)));
__m128i psi_i_m1_m3 __attribute__ ((aligned(16)));
__m128i psi_i_m1_m1 __attribute__ ((aligned(16)));
__m128i psi_i_m1_p1 __attribute__ ((aligned(16)));
__m128i psi_i_m1_p3 __attribute__ ((aligned(16)));
__m128i psi_i_m1_p5 __attribute__ ((aligned(16)));
__m128i psi_i_m1_p7 __attribute__ ((aligned(16)));
__m128i psi_i_p1_m7 __attribute__ ((aligned(16)));
__m128i psi_i_p1_m5 __attribute__ ((aligned(16)));
__m128i psi_i_p1_m3 __attribute__ ((aligned(16)));
__m128i psi_i_p1_m1 __attribute__ ((aligned(16)));
__m128i psi_i_p1_p1 __attribute__ ((aligned(16)));
__m128i psi_i_p1_p3 __attribute__ ((aligned(16)));
__m128i psi_i_p1_p5 __attribute__ ((aligned(16)));
__m128i psi_i_p1_p7 __attribute__ ((aligned(16)));
__m128i psi_i_p3_m7 __attribute__ ((aligned(16)));
__m128i psi_i_p3_m5 __attribute__ ((aligned(16)));
__m128i psi_i_p3_m3 __attribute__ ((aligned(16)));
__m128i psi_i_p3_m1 __attribute__ ((aligned(16)));
__m128i psi_i_p3_p1 __attribute__ ((aligned(16)));
__m128i psi_i_p3_p3 __attribute__ ((aligned(16)));
__m128i psi_i_p3_p5 __attribute__ ((aligned(16)));
__m128i psi_i_p3_p7 __attribute__ ((aligned(16)));
__m128i psi_i_p5_m7 __attribute__ ((aligned(16)));
__m128i psi_i_p5_m5 __attribute__ ((aligned(16)));
__m128i psi_i_p5_m3 __attribute__ ((aligned(16)));
__m128i psi_i_p5_m1 __attribute__ ((aligned(16)));
__m128i psi_i_p5_p1 __attribute__ ((aligned(16)));
__m128i psi_i_p5_p3 __attribute__ ((aligned(16)));
__m128i psi_i_p5_p5 __attribute__ ((aligned(16)));
__m128i psi_i_p5_p7 __attribute__ ((aligned(16)));
__m128i psi_i_p7_m7 __attribute__ ((aligned(16)));
__m128i psi_i_p7_m5 __attribute__ ((aligned(16)));
__m128i psi_i_p7_m3 __attribute__ ((aligned(16)));
__m128i psi_i_p7_m1 __attribute__ ((aligned(16)));
__m128i psi_i_p7_p1 __attribute__ ((aligned(16)));
__m128i psi_i_p7_p3 __attribute__ ((aligned(16)));
__m128i psi_i_p7_p5 __attribute__ ((aligned(16)));
__m128i psi_i_p7_p7 __attribute__ ((aligned(16)));

// a_r_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the real part of a per-candidate interference
// amplitude "a" (cf. the a_r arguments of the helper macros in this file) - confirm.
__m128i a_r_m7_m7 __attribute__ ((aligned(16)));
__m128i a_r_m7_m5 __attribute__ ((aligned(16)));
__m128i a_r_m7_m3 __attribute__ ((aligned(16)));
__m128i a_r_m7_m1 __attribute__ ((aligned(16)));
__m128i a_r_m7_p1 __attribute__ ((aligned(16)));
__m128i a_r_m7_p3 __attribute__ ((aligned(16)));
__m128i a_r_m7_p5 __attribute__ ((aligned(16)));
__m128i a_r_m7_p7 __attribute__ ((aligned(16)));
__m128i a_r_m5_m7 __attribute__ ((aligned(16)));
__m128i a_r_m5_m5 __attribute__ ((aligned(16)));
__m128i a_r_m5_m3 __attribute__ ((aligned(16)));
__m128i a_r_m5_m1 __attribute__ ((aligned(16)));
__m128i a_r_m5_p1 __attribute__ ((aligned(16)));
__m128i a_r_m5_p3 __attribute__ ((aligned(16)));
__m128i a_r_m5_p5 __attribute__ ((aligned(16)));
__m128i a_r_m5_p7 __attribute__ ((aligned(16)));
__m128i a_r_m3_m7 __attribute__ ((aligned(16)));
__m128i a_r_m3_m5 __attribute__ ((aligned(16)));
__m128i a_r_m3_m3 __attribute__ ((aligned(16)));
__m128i a_r_m3_m1 __attribute__ ((aligned(16)));
__m128i a_r_m3_p1 __attribute__ ((aligned(16)));
__m128i a_r_m3_p3 __attribute__ ((aligned(16)));
__m128i a_r_m3_p5 __attribute__ ((aligned(16)));
__m128i a_r_m3_p7 __attribute__ ((aligned(16)));
__m128i a_r_m1_m7 __attribute__ ((aligned(16)));
__m128i a_r_m1_m5 __attribute__ ((aligned(16)));
__m128i a_r_m1_m3 __attribute__ ((aligned(16)));
__m128i a_r_m1_m1 __attribute__ ((aligned(16)));
__m128i a_r_m1_p1 __attribute__ ((aligned(16)));
__m128i a_r_m1_p3 __attribute__ ((aligned(16)));
__m128i a_r_m1_p5 __attribute__ ((aligned(16)));
__m128i a_r_m1_p7 __attribute__ ((aligned(16)));
__m128i a_r_p1_m7 __attribute__ ((aligned(16)));
__m128i a_r_p1_m5 __attribute__ ((aligned(16)));
__m128i a_r_p1_m3 __attribute__ ((aligned(16)));
__m128i a_r_p1_m1 __attribute__ ((aligned(16)));
__m128i a_r_p1_p1 __attribute__ ((aligned(16)));
__m128i a_r_p1_p3 __attribute__ ((aligned(16)));
__m128i a_r_p1_p5 __attribute__ ((aligned(16)));
__m128i a_r_p1_p7 __attribute__ ((aligned(16)));
__m128i a_r_p3_m7 __attribute__ ((aligned(16)));
__m128i a_r_p3_m5 __attribute__ ((aligned(16)));
__m128i a_r_p3_m3 __attribute__ ((aligned(16)));
__m128i a_r_p3_m1 __attribute__ ((aligned(16)));
__m128i a_r_p3_p1 __attribute__ ((aligned(16)));
__m128i a_r_p3_p3 __attribute__ ((aligned(16)));
__m128i a_r_p3_p5 __attribute__ ((aligned(16)));
__m128i a_r_p3_p7 __attribute__ ((aligned(16)));
__m128i a_r_p5_m7 __attribute__ ((aligned(16)));
__m128i a_r_p5_m5 __attribute__ ((aligned(16)));
__m128i a_r_p5_m3 __attribute__ ((aligned(16)));
__m128i a_r_p5_m1 __attribute__ ((aligned(16)));
__m128i a_r_p5_p1 __attribute__ ((aligned(16)));
__m128i a_r_p5_p3 __attribute__ ((aligned(16)));
__m128i a_r_p5_p5 __attribute__ ((aligned(16)));
__m128i a_r_p5_p7 __attribute__ ((aligned(16)));
__m128i a_r_p7_m7 __attribute__ ((aligned(16)));
__m128i a_r_p7_m5 __attribute__ ((aligned(16)));
__m128i a_r_p7_m3 __attribute__ ((aligned(16)));
__m128i a_r_p7_m1 __attribute__ ((aligned(16)));
__m128i a_r_p7_p1 __attribute__ ((aligned(16)));
__m128i a_r_p7_p3 __attribute__ ((aligned(16)));
__m128i a_r_p7_p5 __attribute__ ((aligned(16)));
__m128i a_r_p7_p7 __attribute__ ((aligned(16)));

// a_i_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the imaginary-part counterpart of the a_r_*
// variables - confirm against the code that fills these in.
__m128i a_i_m7_m7 __attribute__ ((aligned(16)));
__m128i a_i_m7_m5 __attribute__ ((aligned(16)));
__m128i a_i_m7_m3 __attribute__ ((aligned(16)));
__m128i a_i_m7_m1 __attribute__ ((aligned(16)));
__m128i a_i_m7_p1 __attribute__ ((aligned(16)));
__m128i a_i_m7_p3 __attribute__ ((aligned(16)));
__m128i a_i_m7_p5 __attribute__ ((aligned(16)));
__m128i a_i_m7_p7 __attribute__ ((aligned(16)));
__m128i a_i_m5_m7 __attribute__ ((aligned(16)));
__m128i a_i_m5_m5 __attribute__ ((aligned(16)));
__m128i a_i_m5_m3 __attribute__ ((aligned(16)));
__m128i a_i_m5_m1 __attribute__ ((aligned(16)));
__m128i a_i_m5_p1 __attribute__ ((aligned(16)));
__m128i a_i_m5_p3 __attribute__ ((aligned(16)));
__m128i a_i_m5_p5 __attribute__ ((aligned(16)));
__m128i a_i_m5_p7 __attribute__ ((aligned(16)));
__m128i a_i_m3_m7 __attribute__ ((aligned(16)));
__m128i a_i_m3_m5 __attribute__ ((aligned(16)));
__m128i a_i_m3_m3 __attribute__ ((aligned(16)));
__m128i a_i_m3_m1 __attribute__ ((aligned(16)));
__m128i a_i_m3_p1 __attribute__ ((aligned(16)));
__m128i a_i_m3_p3 __attribute__ ((aligned(16)));
__m128i a_i_m3_p5 __attribute__ ((aligned(16)));
__m128i a_i_m3_p7 __attribute__ ((aligned(16)));
__m128i a_i_m1_m7 __attribute__ ((aligned(16)));
__m128i a_i_m1_m5 __attribute__ ((aligned(16)));
__m128i a_i_m1_m3 __attribute__ ((aligned(16)));
__m128i a_i_m1_m1 __attribute__ ((aligned(16)));
__m128i a_i_m1_p1 __attribute__ ((aligned(16)));
__m128i a_i_m1_p3 __attribute__ ((aligned(16)));
__m128i a_i_m1_p5 __attribute__ ((aligned(16)));
__m128i a_i_m1_p7 __attribute__ ((aligned(16)));
__m128i a_i_p1_m7 __attribute__ ((aligned(16)));
__m128i a_i_p1_m5 __attribute__ ((aligned(16)));
__m128i a_i_p1_m3 __attribute__ ((aligned(16)));
__m128i a_i_p1_m1 __attribute__ ((aligned(16)));
__m128i a_i_p1_p1 __attribute__ ((aligned(16)));
__m128i a_i_p1_p3 __attribute__ ((aligned(16)));
__m128i a_i_p1_p5 __attribute__ ((aligned(16)));
__m128i a_i_p1_p7 __attribute__ ((aligned(16)));
__m128i a_i_p3_m7 __attribute__ ((aligned(16)));
__m128i a_i_p3_m5 __attribute__ ((aligned(16)));
__m128i a_i_p3_m3 __attribute__ ((aligned(16)));
__m128i a_i_p3_m1 __attribute__ ((aligned(16)));
__m128i a_i_p3_p1 __attribute__ ((aligned(16)));
__m128i a_i_p3_p3 __attribute__ ((aligned(16)));
__m128i a_i_p3_p5 __attribute__ ((aligned(16)));
__m128i a_i_p3_p7 __attribute__ ((aligned(16)));
__m128i a_i_p5_m7 __attribute__ ((aligned(16)));
__m128i a_i_p5_m5 __attribute__ ((aligned(16)));
__m128i a_i_p5_m3 __attribute__ ((aligned(16)));
__m128i a_i_p5_m1 __attribute__ ((aligned(16)));
__m128i a_i_p5_p1 __attribute__ ((aligned(16)));
__m128i a_i_p5_p3 __attribute__ ((aligned(16)));
__m128i a_i_p5_p5 __attribute__ ((aligned(16)));
__m128i a_i_p5_p7 __attribute__ ((aligned(16)));
__m128i a_i_p7_m7 __attribute__ ((aligned(16)));
__m128i a_i_p7_m5 __attribute__ ((aligned(16)));
__m128i a_i_p7_m3 __attribute__ ((aligned(16)));
__m128i a_i_p7_m1 __attribute__ ((aligned(16)));
__m128i a_i_p7_p1 __attribute__ ((aligned(16)));
__m128i a_i_p7_p3 __attribute__ ((aligned(16)));
__m128i a_i_p7_p5 __attribute__ ((aligned(16)));
__m128i a_i_p7_p7 __attribute__ ((aligned(16)));

// psi_a_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the outputs of prodsum_psi_a_epi16
// (psi_r*a_r + psi_i*a_i) for each candidate - confirm against the call sites.
__m128i psi_a_m7_m7 __attribute__ ((aligned(16)));
__m128i psi_a_m7_m5 __attribute__ ((aligned(16)));
__m128i psi_a_m7_m3 __attribute__ ((aligned(16)));
__m128i psi_a_m7_m1 __attribute__ ((aligned(16)));
__m128i psi_a_m7_p1 __attribute__ ((aligned(16)));
__m128i psi_a_m7_p3 __attribute__ ((aligned(16)));
__m128i psi_a_m7_p5 __attribute__ ((aligned(16)));
__m128i psi_a_m7_p7 __attribute__ ((aligned(16)));
__m128i psi_a_m5_m7 __attribute__ ((aligned(16)));
__m128i psi_a_m5_m5 __attribute__ ((aligned(16)));
__m128i psi_a_m5_m3 __attribute__ ((aligned(16)));
__m128i psi_a_m5_m1 __attribute__ ((aligned(16)));
__m128i psi_a_m5_p1 __attribute__ ((aligned(16)));
__m128i psi_a_m5_p3 __attribute__ ((aligned(16)));
__m128i psi_a_m5_p5 __attribute__ ((aligned(16)));
__m128i psi_a_m5_p7 __attribute__ ((aligned(16)));
__m128i psi_a_m3_m7 __attribute__ ((aligned(16)));
__m128i psi_a_m3_m5 __attribute__ ((aligned(16)));
__m128i psi_a_m3_m3 __attribute__ ((aligned(16)));
__m128i psi_a_m3_m1 __attribute__ ((aligned(16)));
__m128i psi_a_m3_p1 __attribute__ ((aligned(16)));
__m128i psi_a_m3_p3 __attribute__ ((aligned(16)));
__m128i psi_a_m3_p5 __attribute__ ((aligned(16)));
__m128i psi_a_m3_p7 __attribute__ ((aligned(16)));
__m128i psi_a_m1_m7 __attribute__ ((aligned(16)));
__m128i psi_a_m1_m5 __attribute__ ((aligned(16)));
__m128i psi_a_m1_m3 __attribute__ ((aligned(16)));
__m128i psi_a_m1_m1 __attribute__ ((aligned(16)));
__m128i psi_a_m1_p1 __attribute__ ((aligned(16)));
__m128i psi_a_m1_p3 __attribute__ ((aligned(16)));
__m128i psi_a_m1_p5 __attribute__ ((aligned(16)));
__m128i psi_a_m1_p7 __attribute__ ((aligned(16)));
__m128i psi_a_p1_m7 __attribute__ ((aligned(16)));
__m128i psi_a_p1_m5 __attribute__ ((aligned(16)));
__m128i psi_a_p1_m3 __attribute__ ((aligned(16)));
__m128i psi_a_p1_m1 __attribute__ ((aligned(16)));
__m128i psi_a_p1_p1 __attribute__ ((aligned(16)));
__m128i psi_a_p1_p3 __attribute__ ((aligned(16)));
__m128i psi_a_p1_p5 __attribute__ ((aligned(16)));
__m128i psi_a_p1_p7 __attribute__ ((aligned(16)));
__m128i psi_a_p3_m7 __attribute__ ((aligned(16)));
__m128i psi_a_p3_m5 __attribute__ ((aligned(16)));
__m128i psi_a_p3_m3 __attribute__ ((aligned(16)));
__m128i psi_a_p3_m1 __attribute__ ((aligned(16)));
__m128i psi_a_p3_p1 __attribute__ ((aligned(16)));
__m128i psi_a_p3_p3 __attribute__ ((aligned(16)));
__m128i psi_a_p3_p5 __attribute__ ((aligned(16)));
__m128i psi_a_p3_p7 __attribute__ ((aligned(16)));
__m128i psi_a_p5_m7 __attribute__ ((aligned(16)));
__m128i psi_a_p5_m5 __attribute__ ((aligned(16)));
__m128i psi_a_p5_m3 __attribute__ ((aligned(16)));
__m128i psi_a_p5_m1 __attribute__ ((aligned(16)));
__m128i psi_a_p5_p1 __attribute__ ((aligned(16)));
__m128i psi_a_p5_p3 __attribute__ ((aligned(16)));
__m128i psi_a_p5_p5 __attribute__ ((aligned(16)));
__m128i psi_a_p5_p7 __attribute__ ((aligned(16)));
__m128i psi_a_p7_m7 __attribute__ ((aligned(16)));
__m128i psi_a_p7_m5 __attribute__ ((aligned(16)));
__m128i psi_a_p7_m3 __attribute__ ((aligned(16)));
__m128i psi_a_p7_m1 __attribute__ ((aligned(16)));
__m128i psi_a_p7_p1 __attribute__ ((aligned(16)));
__m128i psi_a_p7_p3 __attribute__ ((aligned(16)));
__m128i psi_a_p7_p5 __attribute__ ((aligned(16)));
__m128i psi_a_p7_p7 __attribute__ ((aligned(16)));

// a_sq_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the outputs of square_a_epi16 / square_a_64qam_epi16
// (scaled a_r^2 + a_i^2) for each candidate - confirm against the call sites.
__m128i a_sq_m7_m7 __attribute__ ((aligned(16)));
__m128i a_sq_m7_m5 __attribute__ ((aligned(16)));
__m128i a_sq_m7_m3 __attribute__ ((aligned(16)));
__m128i a_sq_m7_m1 __attribute__ ((aligned(16)));
__m128i a_sq_m7_p1 __attribute__ ((aligned(16)));
__m128i a_sq_m7_p3 __attribute__ ((aligned(16)));
__m128i a_sq_m7_p5 __attribute__ ((aligned(16)));
__m128i a_sq_m7_p7 __attribute__ ((aligned(16)));
__m128i a_sq_m5_m7 __attribute__ ((aligned(16)));
__m128i a_sq_m5_m5 __attribute__ ((aligned(16)));
__m128i a_sq_m5_m3 __attribute__ ((aligned(16)));
__m128i a_sq_m5_m1 __attribute__ ((aligned(16)));
__m128i a_sq_m5_p1 __attribute__ ((aligned(16)));
__m128i a_sq_m5_p3 __attribute__ ((aligned(16)));
__m128i a_sq_m5_p5 __attribute__ ((aligned(16)));
__m128i a_sq_m5_p7 __attribute__ ((aligned(16)));
__m128i a_sq_m3_m7 __attribute__ ((aligned(16)));
__m128i a_sq_m3_m5 __attribute__ ((aligned(16)));
__m128i a_sq_m3_m3 __attribute__ ((aligned(16)));
__m128i a_sq_m3_m1 __attribute__ ((aligned(16)));
__m128i a_sq_m3_p1 __attribute__ ((aligned(16)));
__m128i a_sq_m3_p3 __attribute__ ((aligned(16)));
__m128i a_sq_m3_p5 __attribute__ ((aligned(16)));
__m128i a_sq_m3_p7 __attribute__ ((aligned(16)));
__m128i a_sq_m1_m7 __attribute__ ((aligned(16)));
__m128i a_sq_m1_m5 __attribute__ ((aligned(16)));
__m128i a_sq_m1_m3 __attribute__ ((aligned(16)));
__m128i a_sq_m1_m1 __attribute__ ((aligned(16)));
__m128i a_sq_m1_p1 __attribute__ ((aligned(16)));
__m128i a_sq_m1_p3 __attribute__ ((aligned(16)));
__m128i a_sq_m1_p5 __attribute__ ((aligned(16)));
__m128i a_sq_m1_p7 __attribute__ ((aligned(16)));
__m128i a_sq_p1_m7 __attribute__ ((aligned(16)));
__m128i a_sq_p1_m5 __attribute__ ((aligned(16)));
__m128i a_sq_p1_m3 __attribute__ ((aligned(16)));
__m128i a_sq_p1_m1 __attribute__ ((aligned(16)));
__m128i a_sq_p1_p1 __attribute__ ((aligned(16)));
__m128i a_sq_p1_p3 __attribute__ ((aligned(16)));
__m128i a_sq_p1_p5 __attribute__ ((aligned(16)));
__m128i a_sq_p1_p7 __attribute__ ((aligned(16)));
__m128i a_sq_p3_m7 __attribute__ ((aligned(16)));
__m128i a_sq_p3_m5 __attribute__ ((aligned(16)));
__m128i a_sq_p3_m3 __attribute__ ((aligned(16)));
__m128i a_sq_p3_m1 __attribute__ ((aligned(16)));
__m128i a_sq_p3_p1 __attribute__ ((aligned(16)));
__m128i a_sq_p3_p3 __attribute__ ((aligned(16)));
__m128i a_sq_p3_p5 __attribute__ ((aligned(16)));
__m128i a_sq_p3_p7 __attribute__ ((aligned(16)));
__m128i a_sq_p5_m7 __attribute__ ((aligned(16)));
__m128i a_sq_p5_m5 __attribute__ ((aligned(16)));
__m128i a_sq_p5_m3 __attribute__ ((aligned(16)));
__m128i a_sq_p5_m1 __attribute__ ((aligned(16)));
__m128i a_sq_p5_p1 __attribute__ ((aligned(16)));
__m128i a_sq_p5_p3 __attribute__ ((aligned(16)));
__m128i a_sq_p5_p5 __attribute__ ((aligned(16)));
__m128i a_sq_p5_p7 __attribute__ ((aligned(16)));
__m128i a_sq_p7_m7 __attribute__ ((aligned(16)));
__m128i a_sq_p7_m5 __attribute__ ((aligned(16)));
__m128i a_sq_p7_m3 __attribute__ ((aligned(16)));
__m128i a_sq_p7_m1 __attribute__ ((aligned(16)));
__m128i a_sq_p7_p1 __attribute__ ((aligned(16)));
__m128i a_sq_p7_p3 __attribute__ ((aligned(16)));
__m128i a_sq_p7_p5 __attribute__ ((aligned(16)));
__m128i a_sq_p7_p7 __attribute__ ((aligned(16)));

// bit_met_<a>_<b>: 64 file-scope SSE variables, one for every pair (a,b) with
// a,b in {m7,m5,m3,m1,p1,p3,p5,p7}. Not referenced in the visible part of the file.
// NOTE(review): presumably the per-candidate bit metric from which the LLRs are
// derived - confirm against the code that fills these in.
__m128i bit_met_m7_m7 __attribute__ ((aligned(16)));
__m128i bit_met_m7_m5 __attribute__ ((aligned(16)));
__m128i bit_met_m7_m3 __attribute__ ((aligned(16)));
__m128i bit_met_m7_m1 __attribute__ ((aligned(16)));
__m128i bit_met_m7_p1 __attribute__ ((aligned(16)));
__m128i bit_met_m7_p3 __attribute__ ((aligned(16)));
__m128i bit_met_m7_p5 __attribute__ ((aligned(16)));
__m128i bit_met_m7_p7 __attribute__ ((aligned(16)));
__m128i bit_met_m5_m7 __attribute__ ((aligned(16)));
__m128i bit_met_m5_m5 __attribute__ ((aligned(16)));
__m128i bit_met_m5_m3 __attribute__ ((aligned(16)));
__m128i bit_met_m5_m1 __attribute__ ((aligned(16)));
__m128i bit_met_m5_p1 __attribute__ ((aligned(16)));
__m128i bit_met_m5_p3 __attribute__ ((aligned(16)));
__m128i bit_met_m5_p5 __attribute__ ((aligned(16)));
__m128i bit_met_m5_p7 __attribute__ ((aligned(16)));
__m128i bit_met_m3_m7 __attribute__ ((aligned(16)));
__m128i bit_met_m3_m5 __attribute__ ((aligned(16)));
__m128i bit_met_m3_m3 __attribute__ ((aligned(16)));
__m128i bit_met_m3_m1 __attribute__ ((aligned(16)));
__m128i bit_met_m3_p1 __attribute__ ((aligned(16)));
__m128i bit_met_m3_p3 __attribute__ ((aligned(16)));
__m128i bit_met_m3_p5 __attribute__ ((aligned(16)));
__m128i bit_met_m3_p7 __attribute__ ((aligned(16)));
__m128i bit_met_m1_m7 __attribute__ ((aligned(16)));
__m128i bit_met_m1_m5 __attribute__ ((aligned(16)));
__m128i bit_met_m1_m3 __attribute__ ((aligned(16)));
__m128i bit_met_m1_m1 __attribute__ ((aligned(16)));
__m128i bit_met_m1_p1 __attribute__ ((aligned(16)));
__m128i bit_met_m1_p3 __attribute__ ((aligned(16)));
__m128i bit_met_m1_p5 __attribute__ ((aligned(16)));
__m128i bit_met_m1_p7 __attribute__ ((aligned(16)));
__m128i bit_met_p1_m7 __attribute__ ((aligned(16)));
__m128i bit_met_p1_m5 __attribute__ ((aligned(16)));
__m128i bit_met_p1_m3 __attribute__ ((aligned(16)));
__m128i bit_met_p1_m1 __attribute__ ((aligned(16)));
__m128i bit_met_p1_p1 __attribute__ ((aligned(16)));
__m128i bit_met_p1_p3 __attribute__ ((aligned(16)));
__m128i bit_met_p1_p5 __attribute__ ((aligned(16)));
__m128i bit_met_p1_p7 __attribute__ ((aligned(16)));
__m128i bit_met_p3_m7 __attribute__ ((aligned(16)));
__m128i bit_met_p3_m5 __attribute__ ((aligned(16)));
__m128i bit_met_p3_m3 __attribute__ ((aligned(16)));
__m128i bit_met_p3_m1 __attribute__ ((aligned(16)));
__m128i bit_met_p3_p1 __attribute__ ((aligned(16)));
__m128i bit_met_p3_p3 __attribute__ ((aligned(16)));
__m128i bit_met_p3_p5 __attribute__ ((aligned(16)));
__m128i bit_met_p3_p7 __attribute__ ((aligned(16)));
__m128i bit_met_p5_m7 __attribute__ ((aligned(16)));
__m128i bit_met_p5_m5 __attribute__ ((aligned(16)));
__m128i bit_met_p5_m3 __attribute__ ((aligned(16)));
__m128i bit_met_p5_m1 __attribute__ ((aligned(16)));
__m128i bit_met_p5_p1 __attribute__ ((aligned(16)));
__m128i bit_met_p5_p3 __attribute__ ((aligned(16)));
__m128i bit_met_p5_p5 __attribute__ ((aligned(16)));
__m128i bit_met_p5_p7 __attribute__ ((aligned(16)));
__m128i bit_met_p7_m7 __attribute__ ((aligned(16)));
__m128i bit_met_p7_m5 __attribute__ ((aligned(16)));
__m128i bit_met_p7_m3 __attribute__ ((aligned(16)));
__m128i bit_met_p7_m1 __attribute__ ((aligned(16)));
__m128i bit_met_p7_p1 __attribute__ ((aligned(16)));
__m128i bit_met_p7_p3 __attribute__ ((aligned(16)));
__m128i bit_met_p7_p5 __attribute__ ((aligned(16)));
__m128i bit_met_p7_p7 __attribute__ ((aligned(16)));

// y0_p_<a>_<b> and y0_m_<a>_<b>: 2 x 16 file-scope SSE variables, one for every
// pair (a,b) with a,b in {1,3,5,7}. Not referenced in the visible part of the file.
// NOTE(review): presumably "y0 plus" / "y0 minus" combinations of the received
// signal with amplitude-scaled terms - confirm against the code that fills these in.
__m128i  y0_p_1_1 __attribute__ ((aligned(16)));
__m128i  y0_p_1_3 __attribute__ ((aligned(16)));
__m128i  y0_p_1_5 __attribute__ ((aligned(16)));
__m128i  y0_p_1_7 __attribute__ ((aligned(16)));
__m128i  y0_p_3_1 __attribute__ ((aligned(16)));
__m128i  y0_p_3_3 __attribute__ ((aligned(16)));
__m128i  y0_p_3_5 __attribute__ ((aligned(16)));
__m128i  y0_p_3_7 __attribute__ ((aligned(16)));
__m128i  y0_p_5_1 __attribute__ ((aligned(16)));
__m128i  y0_p_5_3 __attribute__ ((aligned(16)));
__m128i  y0_p_5_5 __attribute__ ((aligned(16)));
__m128i  y0_p_5_7 __attribute__ ((aligned(16)));
__m128i  y0_p_7_1 __attribute__ ((aligned(16)));
__m128i  y0_p_7_3 __attribute__ ((aligned(16)));
__m128i  y0_p_7_5 __attribute__ ((aligned(16)));
__m128i  y0_p_7_7 __attribute__ ((aligned(16)));
__m128i  y0_m_1_1 __attribute__ ((aligned(16)));
__m128i  y0_m_1_3 __attribute__ ((aligned(16)));
__m128i  y0_m_1_5 __attribute__ ((aligned(16)));
__m128i  y0_m_1_7 __attribute__ ((aligned(16)));
__m128i  y0_m_3_1 __attribute__ ((aligned(16)));
__m128i  y0_m_3_3 __attribute__ ((aligned(16)));
__m128i  y0_m_3_5 __attribute__ ((aligned(16)));
__m128i  y0_m_3_7 __attribute__ ((aligned(16)));
__m128i  y0_m_5_1 __attribute__ ((aligned(16)));
__m128i  y0_m_5_3 __attribute__ ((aligned(16)));
__m128i  y0_m_5_5 __attribute__ ((aligned(16)));
__m128i  y0_m_5_7 __attribute__ ((aligned(16)));
__m128i  y0_m_7_1 __attribute__ ((aligned(16)));
__m128i  y0_m_7_3 __attribute__ ((aligned(16)));
__m128i  y0_m_7_5 __attribute__ ((aligned(16)));
__m128i  y0_m_7_7 __attribute__ ((aligned(16)));

// xmm0..xmm8: nine file-scope SSE variables. Despite the names, these are
// ordinary global objects with external linkage, not the hardware XMM registers.
// Not referenced in the visible part of the file.
// NOTE(review): presumably general-purpose scratch values for the LLR routines - confirm.
__m128i  xmm0 __attribute__ ((aligned(16)));
__m128i  xmm1 __attribute__ ((aligned(16)));
__m128i  xmm2 __attribute__ ((aligned(16)));
__m128i  xmm3 __attribute__ ((aligned(16)));
__m128i  xmm4 __attribute__ ((aligned(16)));
__m128i  xmm5 __attribute__ ((aligned(16)));
__m128i  xmm6 __attribute__ ((aligned(16)));
__m128i  xmm7 __attribute__ ((aligned(16)));
__m128i  xmm8 __attribute__ ((aligned(16)));

// y0r/y0i, y1r/y1i, y2r/y2i: six file-scope SSE variables.
// Not referenced in the visible part of the file.
// NOTE(review): presumably real (r) and imaginary (i) parts of three received
// or intermediate signals y0, y1, y2 - confirm against the code that uses them.
__m128i  y0r __attribute__ ((aligned(16)));
__m128i  y0i __attribute__ ((aligned(16)));
__m128i  y1r __attribute__ ((aligned(16)));
__m128i  y1i __attribute__ ((aligned(16)));
__m128i  y2r __attribute__ ((aligned(16)));
__m128i  y2i __attribute__ ((aligned(16)));

// logmax_{num,den}_{re,im}{0,1}: eight file-scope SSE variables.
// Not referenced in the visible part of the file.
// NOTE(review): presumably numerator/denominator maxima of a max-log LLR
// computation for the real/imaginary bits - confirm against the code that uses them.
__m128i  logmax_num_re0 __attribute__ ((aligned(16)));
__m128i  logmax_num_im0 __attribute__ ((aligned(16)));
__m128i  logmax_den_re0 __attribute__ ((aligned(16)));
__m128i  logmax_den_im0 __attribute__ ((aligned(16)));
__m128i  logmax_num_re1 __attribute__ ((aligned(16)));
__m128i  logmax_num_im1 __attribute__ ((aligned(16)));
__m128i  logmax_den_re1 __attribute__ ((aligned(16)));
__m128i  logmax_den_im1 __attribute__ ((aligned(16)));

// Scratch variables written by the helper macros defined further down in this
// file. tmp_result and tmp_result2 are overwritten by every one of those macros
// (prodsum_psi_a_epi16, interference_abs_epi16, interference_abs_64qam_epi16,
// square_a_epi16, square_a_64qam_epi16); tmp_result3 and tmp_result4 are
// additionally overwritten by interference_abs_64qam_epi16.
// Because they are shared globals, their contents are not preserved across
// macro expansions.
__m128i tmp_result  __attribute__ ((aligned(16)));
__m128i tmp_result2 __attribute__ ((aligned(16)));
__m128i tmp_result3 __attribute__ ((aligned(16)));
__m128i tmp_result4 __attribute__ ((aligned(16)));


//==============================================================================================
// Auxiliary macros

// Calculates psi_a = psi_r*a_r + psi_i*a_i on eight signed 16-bit lanes.
// Each product is formed as (_mm_mulhi_epi16(x,y) << 1), i.e. the upper 16 bits
// of the 32-bit product shifted left by one; the two products are then combined
// with a saturating add (_mm_adds_epi16).
// psi_a must be an assignable __m128i lvalue.
// NOTE: expands to several statements ending in ';' (not wrapped in
// do { } while (0)) and overwrites the globals tmp_result and tmp_result2.
#define prodsum_psi_a_epi16(psi_r,a_r,psi_i,a_i,psi_a) tmp_result = _mm_mulhi_epi16(psi_r,a_r); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result2 = _mm_mulhi_epi16(psi_i,a_i); tmp_result2 = _mm_slli_epi16(tmp_result2,1); psi_a = _mm_adds_epi16(tmp_result,tmp_result2);

// Calculates the interference magnitude by a per-lane two-way selection:
//   int_mag = c1 where psi <  int_ch_mag (signed 16-bit compare),
//   int_mag = c2 where psi >= int_ch_mag.
// The compare mask is inverted by XOR with the all-ones constant nr_ones, then
// each mask gates one constant and the two results are OR-ed together.
// int_mag must be an assignable __m128i lvalue.
// NOTE: expands to several statements ending in ';' (not wrapped in
// do { } while (0)) and overwrites the globals tmp_result and tmp_result2.
#define interference_abs_epi16(psi,int_ch_mag,int_mag,c1,c2) tmp_result = _mm_cmplt_epi16(psi,int_ch_mag); tmp_result2 = _mm_xor_si128(tmp_result,(*(__m128i*)&nr_ones[0])); tmp_result = _mm_and_si128(tmp_result,c1); tmp_result2 = _mm_and_si128(tmp_result2,c2); int_mag = _mm_or_si128(tmp_result,tmp_result2);

// Calculates the interference magnitude by a per-lane four-way selection
// (signed 16-bit compares). Provided that, in every lane,
// int_ch_mag <= int_two_ch_mag <= int_three_ch_mag, the result is:
//   a = c1 where                    psi <  int_ch_mag
//   a = c3 where int_ch_mag      <= psi <  int_two_ch_mag
//   a = c5 where int_two_ch_mag  <= psi <= int_three_ch_mag
//   a = c7 where                    psi >  int_three_ch_mag
// Mask usage just before the AND with the constants: tmp_result2 selects the c1
// band, tmp_result the c3 band, tmp_result3 the c5 band and tmp_result4 the c7 band.
// NOTE(review): the original note described these bands as x<2, 2<=x<=4, 4<x<6
// and x>6; which units the caller's thresholds encode is not visible here - confirm.
// a must be an assignable __m128i lvalue.
// NOTE: expands to several statements ending in ';' (not wrapped in
// do { } while (0)), evaluates psi three times, and overwrites the globals
// tmp_result, tmp_result2, tmp_result3 and tmp_result4.
#define interference_abs_64qam_epi16(psi,int_ch_mag,int_two_ch_mag,int_three_ch_mag,a,c1,c3,c5,c7) tmp_result = _mm_cmplt_epi16(psi,int_two_ch_mag); tmp_result3 = _mm_xor_si128(tmp_result,(*(__m128i*)&nr_ones[0])); tmp_result2 = _mm_cmplt_epi16(psi,int_ch_mag); tmp_result = _mm_xor_si128(tmp_result,tmp_result2); tmp_result4 = _mm_cmpgt_epi16(psi,int_three_ch_mag); tmp_result3 = _mm_xor_si128(tmp_result3,tmp_result4); tmp_result = _mm_and_si128(tmp_result,c3); tmp_result2 = _mm_and_si128(tmp_result2,c1); tmp_result3 = _mm_and_si128(tmp_result3,c5); tmp_result4 = _mm_and_si128(tmp_result4,c7); tmp_result = _mm_or_si128(tmp_result,tmp_result2); tmp_result3 = _mm_or_si128(tmp_result3,tmp_result4); a = _mm_or_si128(tmp_result,tmp_result3);

// Calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor on eight signed
// 16-bit lanes. For a_r and a_i separately the chain is:
//   t = mulhi(a,a) << 1;  t = mulhi(t,scale_factor) << 1;  t = mulhi(t,int_ch_mag) << 1
// where mulhi keeps the upper 16 bits of each 32-bit product. The two chains are
// then combined with a saturating add (_mm_adds_epi16).
// a_sq must be an assignable __m128i lvalue.
// NOTE: expands to several statements ending in ';' (not wrapped in
// do { } while (0)), evaluates a_r and a_i twice each, and overwrites the
// globals tmp_result and tmp_result2.
#define square_a_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq) tmp_result = _mm_mulhi_epi16(a_r,a_r); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result = _mm_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result = _mm_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result2 = _mm_mulhi_epi16(a_i,a_i); tmp_result2 = _mm_slli_epi16(tmp_result2,1); tmp_result2 = _mm_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm_slli_epi16(tmp_result2,1); tmp_result2 = _mm_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm_slli_epi16(tmp_result2,1); a_sq = _mm_adds_epi16(tmp_result,tmp_result2);

// Calculates a_sq = int_ch_mag*(a_r^2 + a_i^2)*scale_factor for 64-QAM on eight
// signed 16-bit lanes. Same statement sequence as square_a_epi16, except that
// the result of the scale_factor multiply is shifted left by 3 instead of 1
// (an extra factor of 4 at that stage):
//   t = mulhi(a,a) << 1;  t = mulhi(t,scale_factor) << 3;  t = mulhi(t,int_ch_mag) << 1
// The a_r and a_i chains are then combined with a saturating add (_mm_adds_epi16).
// a_sq must be an assignable __m128i lvalue.
// NOTE: expands to several statements ending in ';' (not wrapped in
// do { } while (0)), evaluates a_r and a_i twice each, and overwrites the
// globals tmp_result and tmp_result2.
#define square_a_64qam_epi16(a_r,a_i,int_ch_mag,scale_factor,a_sq)  tmp_result = _mm_mulhi_epi16(a_r,a_r); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result = _mm_mulhi_epi16(tmp_result,scale_factor); tmp_result = _mm_slli_epi16(tmp_result,3); tmp_result = _mm_mulhi_epi16(tmp_result,int_ch_mag); tmp_result = _mm_slli_epi16(tmp_result,1); tmp_result2 = _mm_mulhi_epi16(a_i,a_i); tmp_result2 = _mm_slli_epi16(tmp_result2,1); tmp_result2 = _mm_mulhi_epi16(tmp_result2,scale_factor); tmp_result2 = _mm_slli_epi16(tmp_result2,3); tmp_result2 = _mm_mulhi_epi16(tmp_result2,int_ch_mag); tmp_result2 = _mm_slli_epi16(tmp_result2,1); a_sq = _mm_adds_epi16(tmp_result,tmp_result2);

#elif defined(__arm__)

#endif

//==============================================================================================
// SINGLE-STREAM
//==============================================================================================

//----------------------------------------------------------------------------------------------
// QPSK
//----------------------------------------------------------------------------------------------

int nr_dlsch_qpsk_llr(NR_DL_FRAME_PARMS *frame_parms,
                   int32_t **rxdataF_comp,
                   int16_t *dlsch_llr,
                   uint8_t symbol,
                   uint32_t len,
                   uint8_t first_symbol_flag,
                   uint16_t nb_rb,
                   uint8_t beamforming_mode)
{
  /*
    QPSK LLRs for one symbol: the compensated samples are used directly as LLRs.

    Copies len 32-bit words (one resource element each) from
    rxdataF_comp[0] + symbol*nb_rb*12 to dlsch_llr.

    frame_parms, first_symbol_flag and beamforming_mode are not used here.

    Returns 0 on success, -1 (after logging an error) when dlsch_llr is NULL.
  */
  uint32_t *re_in = (uint32_t*)&rxdataF_comp[0][((int32_t)symbol*nb_rb*12)];
  uint32_t *llr_out = (uint32_t*)dlsch_llr;
  int re;

  if (!llr_out) {
    LOG_E(PHY,"nr_dlsch_qpsk_llr: llr is null, symbol %d, llr32=%p\n",symbol, llr_out);
    return(-1);
  }

  // one 32-bit word per RE: copy it unchanged
  for (re=0; re<len; re++)
    llr_out[re] = re_in[re];

  return(0);
}


//----------------------------------------------------------------------------------------------
// 16-QAM
//----------------------------------------------------------------------------------------------

void nr_dlsch_16qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                     int32_t **rxdataF_comp,
                     int16_t *dlsch_llr,
                     int32_t **dl_ch_mag,
                     uint8_t symbol,
                     uint32_t len,
                     uint8_t first_symbol_flag,
                     uint16_t nb_rb,
                     int16_t **llr32p,
                     uint8_t beamforming_mode)
{
  /*
    16-QAM LLRs for one symbol.

    For every resource element (RE) four 16-bit LLRs are written, in this order:
      y_R, y_I, ch_mag_R - |y_R|, ch_mag_I - |y_I|
    where y is taken from rxdataF_comp[0] and ch_mag from dl_ch_mag[0], both at
    offset symbol*nb_rb*12.

    Output position: dlsch_llr when first_symbol_flag==1, otherwise *llr32p.
    *llr32p is then advanced to the position following the 4*len LLRs of this symbol.

    REs are processed four at a time (one 128-bit vector); when len is not a
    multiple of 4 the last vector is still processed in full, so up to 3 extra
    REs are read and their LLRs written past the 4*len nominal outputs.

    frame_parms and beamforming_mode are not used here.
    On x86 xmm0 is not declared in this function (it is declared elsewhere in the file).
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  __m128i *ch_mag;
  __m128i llr128[2];
  uint32_t *llr32;
#elif defined(__arm__)
  int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  int16x8_t *ch_mag;
  int16x8_t xmm0;
  int16_t *llr16;
#endif

  int i;
  unsigned char len_mod4=0;

  // select the write position for this symbol
#if defined(__x86_64__) || defined(__i386__)
  if (first_symbol_flag==1) {
    llr32 = (uint32_t*)dlsch_llr;
  } else {
    llr32 = (uint32_t*)*llr32p;
  }
#elif defined(__arm__)
  if (first_symbol_flag==1) {
    llr16 = (int16_t*)dlsch_llr;
  } else {
    llr16 = (int16_t*)*llr32p;
  }
#endif

#if defined(__x86_64__) || defined(__i386__)
  ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*nb_rb*12)];
#elif defined(__arm__)
  ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*nb_rb*12)];
#endif

  // update output pointer according to number of REs in this symbol (<<2 because 4 bits per RE)
  if (first_symbol_flag == 1)
    *llr32p = dlsch_llr + (len<<2);
  else
    *llr32p += (len<<2);

  // number of 128-bit vectors (4 REs each), rounded up
  len_mod4 = len&3;
  len>>=2;
  len+=(len_mod4==0 ? 0 : 1);

  for (i=0; i<len; i++) {

#if defined(__x86_64__) || defined(__i386__)
    xmm0 = _mm_abs_epi16(rxF[i]);
    xmm0 = _mm_subs_epi16(ch_mag[i],xmm0);

    // interleave per RE: (y_R,y_I) followed by (ch_mag-|y_R|, ch_mag-|y_I|)
    llr128[0] = _mm_unpacklo_epi32(rxF[i],xmm0);
    llr128[1] = _mm_unpackhi_epi32(rxF[i],xmm0);
    llr32[0] = _mm_extract_epi32(llr128[0],0);
    llr32[1] = _mm_extract_epi32(llr128[0],1);
    llr32[2] = _mm_extract_epi32(llr128[0],2);
    llr32[3] = _mm_extract_epi32(llr128[0],3);
    llr32[4] = _mm_extract_epi32(llr128[1],0);
    llr32[5] = _mm_extract_epi32(llr128[1],1);
    llr32[6] = _mm_extract_epi32(llr128[1],2);
    llr32[7] = _mm_extract_epi32(llr128[1],3);
    llr32+=8;
#elif defined(__arm__)
    xmm0 = vabsq_s16(rxF[i]);
    xmm0 = vqsubq_s16(ch_mag[i],xmm0);

    // same per-RE ordering as the x86 path: y_R, y_I, ch_mag-|y_R|, ch_mag-|y_I|
    llr16[0] = vgetq_lane_s16(rxF[i],0);
    llr16[1] = vgetq_lane_s16(rxF[i],1);
    llr16[2] = vgetq_lane_s16(xmm0,0);
    llr16[3] = vgetq_lane_s16(xmm0,1);
    llr16[4] = vgetq_lane_s16(rxF[i],2);
    llr16[5] = vgetq_lane_s16(rxF[i],3);
    llr16[6] = vgetq_lane_s16(xmm0,2);
    llr16[7] = vgetq_lane_s16(xmm0,3);
    llr16[8] = vgetq_lane_s16(rxF[i],4);
    llr16[9] = vgetq_lane_s16(rxF[i],5);
    llr16[10] = vgetq_lane_s16(xmm0,4);
    llr16[11] = vgetq_lane_s16(xmm0,5);
    // 4th RE uses lanes 6 (real) and 7 (imag) of both vectors
    llr16[12] = vgetq_lane_s16(rxF[i],6);
    llr16[13] = vgetq_lane_s16(rxF[i],7);
    llr16[14] = vgetq_lane_s16(xmm0,6);
    llr16[15] = vgetq_lane_s16(xmm0,7);
    llr16+=16;
#endif

  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}

//----------------------------------------------------------------------------------------------
// 64-QAM
//----------------------------------------------------------------------------------------------

void nr_dlsch_64qam_llr(NR_DL_FRAME_PARMS *frame_parms,
			int32_t **rxdataF_comp,
			int16_t *dlsch_llr,
			int32_t **dl_ch_mag,
			int32_t **dl_ch_magb,
			uint8_t symbol,
			uint32_t len,
			uint8_t first_symbol_flag,
			uint16_t nb_rb,
			uint32_t llr_offset,
			uint8_t beamforming_mode)
{
  /*
    64-QAM LLRs for one symbol.

    For every resource element (RE) six 16-bit LLRs are written to dlsch_llr, in this order:
      y_R, y_I,
      l1_R = ch_mag_R  - |y_R|,  l1_I = ch_mag_I  - |y_I|,
      l2_R = ch_magb_R - |l1_R|, l2_I = ch_magb_I - |l1_I|
    where y, ch_mag and ch_magb are read from rxdataF_comp[0], dl_ch_mag[0] and
    dl_ch_magb[0] at offset symbol*nb_rb*12. Subtractions saturate on both architectures.

    REs are processed four at a time (one 128-bit vector); when len is not a
    multiple of 4 the last vector is still processed in full, so up to 3 extra
    REs are read and their LLRs written past the 6*len nominal outputs.

    Output always starts at dlsch_llr: frame_parms, first_symbol_flag, llr_offset
    and beamforming_mode are not used by this function.
    On x86 xmm1/xmm2 are not declared in this function (they are declared elsewhere in the file).
  */
#if defined(__x86_64__) || defined(__i386__)
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  __m128i *ch_mag,*ch_magb;
#elif defined(__arm__)
  int16x8_t *rxF = (int16x8_t*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  int16x8_t *ch_mag,*ch_magb,xmm1,xmm2;
#endif
  int i,len2;
  unsigned char len_mod4;
  int16_t *llr2 = dlsch_llr;

#if defined(__x86_64__) || defined(__i386__)
  ch_mag = (__m128i*)&dl_ch_mag[0][(symbol*nb_rb*12)];
  ch_magb = (__m128i*)&dl_ch_magb[0][(symbol*nb_rb*12)];
#elif defined(__arm__)
  ch_mag = (int16x8_t*)&dl_ch_mag[0][(symbol*nb_rb*12)];
  ch_magb = (int16x8_t*)&dl_ch_magb[0][(symbol*nb_rb*12)];
#endif

  // number of 128-bit vectors (4 REs each), rounded up
  len_mod4 =len&3;
  len2=len>>2;
  len2+=((len_mod4==0)?0:1);

  for (i=0; i<len2; i++) {

#if defined(__x86_64__) || defined(__i386__)
    xmm1 = _mm_abs_epi16(rxF[i]);
    xmm1 = _mm_subs_epi16(ch_mag[i],xmm1);
    xmm2 = _mm_abs_epi16(xmm1);
    xmm2 = _mm_subs_epi16(ch_magb[i],xmm2);
#elif defined(__arm__)
    // saturating subtract, matching _mm_subs_epi16 on x86
    xmm1 = vabsq_s16(rxF[i]);
    xmm1 = vqsubq_s16(ch_mag[i],xmm1);
    xmm2 = vabsq_s16(xmm1);
    xmm2 = vqsubq_s16(ch_magb[i],xmm2);
#endif

    // The four REs of the vector are unrolled because the lane index of the
    // extract intrinsics must be a compile-time constant.

    // RE 0: lanes 0,1
    llr2[0] = ((short *)&rxF[i])[0];
    llr2[1] = ((short *)&rxF[i])[1];
#if defined(__x86_64__) || defined(__i386__)
    llr2[2] = _mm_extract_epi16(xmm1,0);
    llr2[3] = _mm_extract_epi16(xmm1,1);
    llr2[4] = _mm_extract_epi16(xmm2,0);
    llr2[5] = _mm_extract_epi16(xmm2,1);
#elif defined(__arm__)
    llr2[2] = vgetq_lane_s16(xmm1,0);
    llr2[3] = vgetq_lane_s16(xmm1,1);
    llr2[4] = vgetq_lane_s16(xmm2,0);
    llr2[5] = vgetq_lane_s16(xmm2,1);
#endif
    llr2+=6;

    // RE 1: lanes 2,3
    llr2[0] = ((short *)&rxF[i])[2];
    llr2[1] = ((short *)&rxF[i])[3];
#if defined(__x86_64__) || defined(__i386__)
    llr2[2] = _mm_extract_epi16(xmm1,2);
    llr2[3] = _mm_extract_epi16(xmm1,3);
    llr2[4] = _mm_extract_epi16(xmm2,2);
    llr2[5] = _mm_extract_epi16(xmm2,3);
#elif defined(__arm__)
    llr2[2] = vgetq_lane_s16(xmm1,2);
    llr2[3] = vgetq_lane_s16(xmm1,3);
    llr2[4] = vgetq_lane_s16(xmm2,2);
    llr2[5] = vgetq_lane_s16(xmm2,3);
#endif
    llr2+=6;

    // RE 2: lanes 4,5
    llr2[0] = ((short *)&rxF[i])[4];
    llr2[1] = ((short *)&rxF[i])[5];
#if defined(__x86_64__) || defined(__i386__)
    llr2[2] = _mm_extract_epi16(xmm1,4);
    llr2[3] = _mm_extract_epi16(xmm1,5);
    llr2[4] = _mm_extract_epi16(xmm2,4);
    llr2[5] = _mm_extract_epi16(xmm2,5);
#elif defined(__arm__)
    llr2[2] = vgetq_lane_s16(xmm1,4);
    llr2[3] = vgetq_lane_s16(xmm1,5);
    llr2[4] = vgetq_lane_s16(xmm2,4);
    llr2[5] = vgetq_lane_s16(xmm2,5);
#endif
    llr2+=6;

    // RE 3: lanes 6,7
    llr2[0] = ((short *)&rxF[i])[6];
    llr2[1] = ((short *)&rxF[i])[7];
#if defined(__x86_64__) || defined(__i386__)
    llr2[2] = _mm_extract_epi16(xmm1,6);
    llr2[3] = _mm_extract_epi16(xmm1,7);
    llr2[4] = _mm_extract_epi16(xmm2,6);
    llr2[5] = _mm_extract_epi16(xmm2,7);
#elif defined(__arm__)
    llr2[2] = vgetq_lane_s16(xmm1,6);
    llr2[3] = vgetq_lane_s16(xmm1,7);
    llr2[4] = vgetq_lane_s16(xmm2,6);
    llr2[5] = vgetq_lane_s16(xmm2,7);
#endif
    llr2+=6;

  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}

//#if 0
void nr_dlsch_64qam_llr_SIC(NR_DL_FRAME_PARMS *frame_parms,
                         int32_t **rxdataF_comp,
                         int32_t **sic_buffer,  //Q15
                         int32_t **rho_i,
                         int16_t *dlsch_llr,
                         uint8_t num_pdcch_symbols,
                         int32_t **dl_ch_mag,
                         int32_t **dl_ch_magb,
                         uint16_t nb_rb,
                         uint8_t subframe,
                         uint16_t mod_order_0,
                         uint32_t rb_alloc)
{
  /*
    64-QAM LLRs with successive interference cancellation, for all symbols from
    num_pdcch_symbols up to the end of the subframe (14 symbols when Ncp==0, else 12).

    Per symbol:
      1. rho_amp_x0     <- sic_data scaled by amp_tmp      (multadd_complex_vector_real_scalar)
      2. rho_rho_amp_x0 <- rho_1 * rho_amp_x0, shift 13    (mult_cpx_vector)
      3. rxF            <- rxF - rho_rho_amp_x0, IN PLACE  (sub_cpx_vector16), i.e. the
         contents of rxdataF_comp[0] are modified
      4. six 16-bit LLRs per RE are written to dlsch_llr:
           y_R, y_I, ch_mag - |y|, ch_magb - |ch_mag - |y||   (real and imaginary lanes)
    The exact arithmetic of the three vector helpers is defined outside this file section;
    the Q-format notes on the calls are the original author's.

    sic_buffer[0] is consumed contiguously: symbol k starts where the REs of the
    previous symbols ended (len_acc).

    REs are processed four at a time; when len is not a multiple of 4 the last
    vector is still processed in full (same rounding as nr_dlsch_64qam_llr).

    xmm1/xmm2 are not declared in this function (they are declared elsewhere in the file).
    This function uses x86 intrinsics unconditionally.
  */
  int16_t rho_amp_x0[2*frame_parms->N_RB_DL*12];
  int16_t rho_rho_amp_x0[2*frame_parms->N_RB_DL*12];
  uint16_t amp_tmp;
  uint16_t *llr32=(uint16_t*)dlsch_llr;
  int i, len,  nsymb, len2;
  uint8_t symbol, symbol_mod;
  int len_acc=0;
  uint16_t *sic_data;
  uint16_t pbch_pss_sss_adjust;
  unsigned char len_mod4=0;
  uint16_t *llr2;
  __m128i *ch_mag,*ch_magb;

  nsymb = (frame_parms->Ncp==0) ? 14:12;

  for (symbol=num_pdcch_symbols; symbol<nsymb; symbol++) {
    uint16_t *rxF = (uint16_t*)(&rxdataF_comp[0][((int16_t)symbol*frame_parms->N_RB_DL*12)]);
    int16_t *rho_1=(int16_t*)(&rho_i[0][((int16_t)symbol*frame_parms->N_RB_DL*12)]);
    __m128i *x1 = (__m128i*)rxF;
    ch_mag = (__m128i*)(&dl_ch_mag[0][((int16_t)symbol*frame_parms->N_RB_DL*12)]);
    ch_magb = (__m128i*)(&dl_ch_magb[0][((int16_t)symbol*frame_parms->N_RB_DL*12)]);
    // index with the full-width accumulator: narrowing it to int16_t would wrap
    // (and go negative) once more than 32767 REs have been accumulated
    sic_data = (uint16_t*)(&sic_buffer[0][len_acc]);

    symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

    pbch_pss_sss_adjust=adjust_G2(frame_parms,&rb_alloc,6,subframe,symbol);

    if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
      amp_tmp = 0x1fff;//dlsch0->sqrt_rho_b; already taken into account
      if (frame_parms->nb_antenna_ports_gNB!=1)
        len = nb_rb*8 - (2*pbch_pss_sss_adjust/3);
      else
        len = nb_rb*10 - (5*pbch_pss_sss_adjust/6);
    } else {
      amp_tmp = 0x1fff; //dlsch0->sqrt_rho_a; already taken into account
      len = nb_rb*12 - pbch_pss_sss_adjust;
    }

    if (mod_order_0==6)
      amp_tmp=amp_tmp<<1; // to compensate for >> 1 shift in modulation

    len_acc+=len;

    multadd_complex_vector_real_scalar((int16_t *)sic_data,
                                        amp_tmp,
                                        (int16_t *)rho_amp_x0, //this is in Q13
                                        1,
                                        len);

    mult_cpx_vector((int16_t *)rho_1, //Q15
                    (int16_t *)rho_amp_x0, //Q13
                    (int16_t*)rho_rho_amp_x0,
                    len,
                    13);

    // cancel the interfering stream in place
    sub_cpx_vector16((int16_t *)rxF,
                      (int16_t *)rho_rho_amp_x0,
                      (int16_t *)rxF,
                      len*2);

    llr2 = llr32;
    llr32 += (len*6);

    // number of 128-bit vectors (4 REs each), rounded UP so that a partial
    // last vector is processed instead of being dropped
    len_mod4 =len&3;
    len2=len>>2;
    len2+=((len_mod4==0)?0:1);

    for (i=0; i<len2; i++) {

      xmm1 = _mm_abs_epi16(x1[i]);
      xmm1 = _mm_subs_epi16(ch_mag[i],xmm1);
      xmm2 = _mm_abs_epi16(xmm1);
      xmm2 = _mm_subs_epi16(ch_magb[i],xmm2);

      // RE 0: lanes 0,1
      llr2[0] = ((short *)&x1[i])[0];
      llr2[1] = ((short *)&x1[i])[1];
      llr2[2] = _mm_extract_epi16(xmm1,0);
      llr2[3] = _mm_extract_epi16(xmm1,1);
      llr2[4] = _mm_extract_epi16(xmm2,0);
      llr2[5] = _mm_extract_epi16(xmm2,1);
      llr2+=6;

      // RE 1: lanes 2,3
      llr2[0] = ((short *)&x1[i])[2];
      llr2[1] = ((short *)&x1[i])[3];
      llr2[2] = _mm_extract_epi16(xmm1,2);
      llr2[3] = _mm_extract_epi16(xmm1,3);
      llr2[4] = _mm_extract_epi16(xmm2,2);
      llr2[5] = _mm_extract_epi16(xmm2,3);
      llr2+=6;

      // RE 2: lanes 4,5
      llr2[0] = ((short *)&x1[i])[4];
      llr2[1] = ((short *)&x1[i])[5];
      llr2[2] = _mm_extract_epi16(xmm1,4);
      llr2[3] = _mm_extract_epi16(xmm1,5);
      llr2[4] = _mm_extract_epi16(xmm2,4);
      llr2[5] = _mm_extract_epi16(xmm2,5);
      llr2+=6;

      // RE 3: lanes 6,7
      llr2[0] = ((short *)&x1[i])[6];
      llr2[1] = ((short *)&x1[i])[7];
      llr2[2] = _mm_extract_epi16(xmm1,6);
      llr2[3] = _mm_extract_epi16(xmm1,7);
      llr2[4] = _mm_extract_epi16(xmm2,6);
      llr2[5] = _mm_extract_epi16(xmm2,7);
      llr2+=6;

    }

    _mm_empty();
    _m_empty();

  }
}
//#endif
//==============================================================================================
// DUAL-STREAM
//==============================================================================================

//----------------------------------------------------------------------------------------------
// QPSK
//----------------------------------------------------------------------------------------------

// File-scope SIMD temporaries for the dual-stream LLR routines (x86 only).
// They are shared, non-static globals, so code using them is not reentrant.
#if defined(__x86_64__) || defined(__i386)
// Halved real/imaginary parts of the matched-filter outputs of stream 0 (y0) and
// stream 1 (y1); nr_qpsk_qpsk fills them with an arithmetic right shift by 1.
__m128i  y0r_over2 __attribute__ ((aligned(16)));
__m128i  y0i_over2 __attribute__ ((aligned(16)));
__m128i  y1r_over2 __attribute__ ((aligned(16)));
__m128i  y1i_over2 __attribute__ ((aligned(16)));

// Absolute-value terms |y1/2 +/- rho| that nr_qpsk_qpsk computes once and reuses for
// the LLRs of both bits. Other users may exist outside this part of the file.
__m128i  A __attribute__ ((aligned(16)));
__m128i  B __attribute__ ((aligned(16)));
__m128i  C __attribute__ ((aligned(16)));
__m128i  D __attribute__ ((aligned(16)));
__m128i  E __attribute__ ((aligned(16)));
__m128i  F __attribute__ ((aligned(16)));
__m128i  G __attribute__ ((aligned(16)));
__m128i  H __attribute__ ((aligned(16)));

#endif

int nr_dlsch_qpsk_qpsk_llr(NR_DL_FRAME_PARMS *frame_parms,
                        int **rxdataF_comp,
                        int **rxdataF_comp_i,
                        int **rho_i,
                        short *dlsch_llr,
                        unsigned char symbol,
                        uint32_t len,
                        unsigned char first_symbol_flag,
                        unsigned short nb_rb,
                        uint16_t pbch_pss_sss_adjust,
                        short **llr16p)
{
  /*
    Dual-stream QPSK/QPSK LLRs for one symbol: wrapper around nr_qpsk_qpsk().

    Inputs are taken from rxdataF_comp[0], rxdataF_comp_i[0] and rho_i[0] at
    offset symbol*nb_rb*12. LLRs are written at dlsch_llr when
    first_symbol_flag==1, otherwise at *llr16p; on return *llr16p points just
    past the 2*len LLRs of this symbol.

    frame_parms and pbch_pss_sss_adjust are not used here. Always returns 0;
    a NULL output pointer triggers AssertFatal.
  */
  int16_t *y0  = (int16_t*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  int16_t *y1  = (int16_t*)&rxdataF_comp_i[0][(symbol*nb_rb*12)];
  int16_t *corr = (int16_t*)&rho_i[0][(symbol*nb_rb*12)];
  int16_t *llr16 = (first_symbol_flag == 1) ? (int16_t*)dlsch_llr : (int16_t*)(*llr16p);

  AssertFatal(llr16!=NULL,"nr_dlsch_qpsk_qpsk_llr: llr is null, symbol %d\n",symbol);

  nr_qpsk_qpsk((short *)y0, (short *)y1, (short *)llr16, (short *)corr, len);

  // 2 LLRs per RE
  *llr16p = (short *)(llr16 + (len<<1));

  return(0);
}

//__m128i ONE_OVER_SQRT_8 __attribute__((aligned(16)));

void nr_qpsk_qpsk(short *stream0_in,
               short *stream1_in,
               short *stream0_out,
               short *rho01,
               int length
         )
{

  /*
    This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers.

    Parameters:
    stream0_in = Matched filter output y0' = (h0*g0)*y0
    stream1_in = Matched filter output y1' = (h0*g1)*y0
    stream0_out = LLRs
    rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0)
    length = number of resource elements

    Implementation notes:
    - All four buffers are accessed as arrays of 128-bit vectors, i.e. 4 complex
      samples (8 int16: re,im interleaved) per vector. Inputs are read with plain
      vector dereferences, outputs are written with unaligned stores.
    - Each loop iteration consumes two vectors (8 REs) from every input and
      produces two output vectors (2 LLRs per RE), so data is read/written in
      groups of 8 REs regardless of length.
    - On x86 the working registers (xmm0..xmm3, y0r, y0i, y1r, y1i, logmax_*,
      rho_rpi, rho_rmi, A..H, y*_over2) are not declared in this function; they are
      file-scope variables, so the function is not reentrant.
    - Only the x86 branches contain code; the __arm__ branches are empty.
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i ONE_OVER_SQRT_8 = _mm_set1_epi16(23170); //round(2^16/sqrt(8))
#elif defined(__arm__)
  int16x8_t *rho01_128i = (int16x8_t *)rho01;
  int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in;
  int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in;
  int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out;
  int16x8_t ONE_OVER_SQRT_8 = vdupq_n_s16(23170); //round(2^16/sqrt(8))
#endif

  int i;


  // i indexes 128-bit vectors (4 REs each); length>>2 is the number of such vectors
  for (i=0; i<length>>2; i+=2) {
    // in each iteration, we take 8 complex samples
#if defined(__x86_64__) || defined(__i386__)
    xmm0 = rho01_128i[i]; // 4 symbols
    xmm1 = rho01_128i[i+1];

    // put (rho_r + rho_i)/2sqrt2 in rho_rpi
    // put (rho_r - rho_i)/2sqrt2 in rho_rmi

    // de-interleave: the three shuffles gather the real parts in the low 64 bits
    // and the imaginary parts in the high 64 bits of each vector
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // divide by sqrt(8), no shift needed ONE_OVER_SQRT_8 = Q1.16
    rho_rpi = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_8);
    rho_rmi = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_8);
#elif defined(__arm__)


#endif
    // Compute LLR for first bit of stream 0

    // Compute real and imaginary parts of MF output for stream 0
#if defined(__x86_64__) || defined(__i386__)
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // real parts of the 8 REs of stream 0
    y0i = _mm_unpackhi_epi64(xmm0,xmm1); // imaginary parts of the 8 REs of stream 0

    y0r_over2  = _mm_srai_epi16(y0r,1);   // divide by 2
    y0i_over2  = _mm_srai_epi16(y0i,1);   // divide by 2
#elif defined(__arm__)


#endif
    // Compute real and imaginary parts of MF output for stream 1
#if defined(__x86_64__) || defined(__i386__)
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); // real parts of the 8 REs of stream 1
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); // imaginary parts of the 8 REs of stream 1

    y1r_over2  = _mm_srai_epi16(y1r,1);   // divide by 2
    y1i_over2  = _mm_srai_epi16(y1i,1);   // divide by 2

    // Compute the terms for the LLR of first bit
    // (in the comments below "rho" denotes rho_rpi and "rho*" denotes rho_rmi,
    //  both already scaled by 1/sqrt(8))

    xmm0 = _mm_setzero_si128(); // ZERO

    // 1 term for numerator of LLR
    xmm3 = _mm_subs_epi16(y1r_over2,rho_rpi);
    A = _mm_abs_epi16(xmm3); // A = |y1r/2 - rho/sqrt(8)|
    xmm2 = _mm_adds_epi16(A,y0i_over2); // = |y1r/2 - rho/sqrt(8)| + y0i/2
    xmm3 = _mm_subs_epi16(y1i_over2,rho_rmi);
    B = _mm_abs_epi16(xmm3); // B = |y1i/2 - rho*/sqrt(8)|
    logmax_num_re0 = _mm_adds_epi16(B,xmm2); // = |y1r/2 - rho/sqrt(8)|+|y1i/2 - rho*/sqrt(8)| + y0i/2

    // 2 term for numerator of LLR
    xmm3 = _mm_subs_epi16(y1r_over2,rho_rmi);
    C = _mm_abs_epi16(xmm3); // C = |y1r/2 - rho*/sqrt(8)|
    xmm2 = _mm_subs_epi16(C,y0i_over2); // = |y1r/2 - rho*/sqrt(8)| - y0i/2
    xmm3 = _mm_adds_epi16(y1i_over2,rho_rpi);
    D = _mm_abs_epi16(xmm3); // D = |y1i/2 + rho/sqrt(8)|
    xmm2 = _mm_adds_epi16(xmm2,D); // |y1r/2 - rho*/sqrt(8)| + |y1i/2 + rho/sqrt(8)| - y0i/2
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0,xmm2); // max, numerator done

    // 1 term for denominator of LLR
    xmm3 = _mm_adds_epi16(y1r_over2,rho_rmi);
    E = _mm_abs_epi16(xmm3); // E = |y1r/2 + rho*/sqrt(8)|
    xmm2 = _mm_adds_epi16(E,y0i_over2); // = |y1r/2 + rho*/sqrt(8)| + y0i/2
    xmm3 = _mm_subs_epi16(y1i_over2,rho_rpi);
    F = _mm_abs_epi16(xmm3); // F = |y1i/2 - rho/sqrt(8)|
    logmax_den_re0 = _mm_adds_epi16(F,xmm2); // = |y1r/2 + rho*/sqrt(8)| + |y1i/2 - rho/sqrt(8)| + y0i/2

    // 2 term for denominator of LLR
    xmm3 = _mm_adds_epi16(y1r_over2,rho_rpi);
    G = _mm_abs_epi16(xmm3); // G = |y1r/2 + rho/sqrt(8)|
    xmm2 = _mm_subs_epi16(G,y0i_over2); // = |y1r/2 + rho/sqrt(8)| - y0i/2
    xmm3 = _mm_adds_epi16(y1i_over2,rho_rmi);
    H = _mm_abs_epi16(xmm3); // H = |y1i/2 + rho*/sqrt(8)|
    xmm2 = _mm_adds_epi16(xmm2,H); // = |y1r/2 + rho/sqrt(8)| + |y1i/2 + rho*/sqrt(8)| - y0i/2
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0,xmm2); // max, denominator done

    // Compute the terms for the LLR of second bit (reuses A..H computed above)

    // 1 term for numerator of LLR
    xmm2 = _mm_adds_epi16(A,y0r_over2);
    logmax_num_im0 = _mm_adds_epi16(B,xmm2); // = |y1r/2 - rho/sqrt(8)| + |y1i/2 - rho*/sqrt(8)| + y0r/2

    // 2 term for numerator of LLR
    xmm2 = _mm_subs_epi16(E,y0r_over2);
    xmm2 = _mm_adds_epi16(xmm2,F); // = |y1r/2 + rho*/sqrt(8)| + |y1i/2 - rho/sqrt(8)| - y0r/2

    logmax_num_im0 = _mm_max_epi16(logmax_num_im0,xmm2); // max, numerator done

    // 1 term for denominator of LLR
    xmm2 = _mm_adds_epi16(C,y0r_over2);
    logmax_den_im0 = _mm_adds_epi16(D,xmm2); // = |y1r/2 - rho*/sqrt(8)| + |y1i/2 + rho/sqrt(8)| + y0r/2

    // 2 term for denominator of LLR
    xmm2 = _mm_subs_epi16(G,y0r_over2);
    xmm2 = _mm_adds_epi16(xmm2,H); // = |y1r/2 + rho/sqrt(8)| + |y1i/2 + rho*/sqrt(8)| - y0r/2

    logmax_den_im0 = _mm_max_epi16(logmax_den_im0,xmm2); // max, denominator done

    // LLR of first bit: y0r + max(numerator terms) - max(denominator terms), one value per RE
    y0r = _mm_adds_epi16(y0r,logmax_num_re0);
    y0r = _mm_subs_epi16(y0r,logmax_den_re0);

    // LLR of second bit: y0i + max(numerator terms) - max(denominator terms), one value per RE
    y0i = _mm_adds_epi16(y0i,logmax_num_im0);
    y0i = _mm_subs_epi16(y0i,logmax_den_im0);

    // interleave the two bits per RE: first 4 REs of this group
    _mm_storeu_si128(&stream0_128i_out[i],_mm_unpacklo_epi16(y0r,y0i)); // = [L1(1), L2(1), L1(2), L2(2)]

    // NOTE(review): i counts 4-RE vectors while the bound is derived from length>>1, so
    // inside this loop (i < length>>2) the test appears to be always true — confirm intent.
    if (i<((length>>1) - 1)) // false if only 2 REs remain
      _mm_storeu_si128(&stream0_128i_out[i+1],_mm_unpackhi_epi16(y0r,y0i));

    // NOTE(review): this #elif repeats __x86_64__, which the #if above already covers,
    // so the branch can never be selected; __arm__ was probably intended.
#elif defined(__x86_64__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}

int nr_dlsch_qpsk_16qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                         int32_t **rxdataF_comp,
                         int32_t **rxdataF_comp_i,
                         int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10})
                         int32_t **rho_i,
                         int16_t *dlsch_llr,
                         uint8_t symbol,
                         uint8_t first_symbol_flag,
                         uint16_t nb_rb,
                         uint16_t pbch_pss_sss_adjust,
                         int16_t **llr16p)
{
  /*
    Dual-stream LLRs for one symbol, QPSK desired stream with 16-QAM interferer:
    wrapper around nr_qpsk_qam16().

    Inputs are taken from rxdataF_comp[0], rxdataF_comp_i[0], dl_ch_mag_i[0] and
    rho_i[0] at offset symbol*N_RB_DL*12. LLRs are written at dlsch_llr when
    first_symbol_flag==1, otherwise at *llr16p; on return *llr16p points just
    past the 2*len LLRs of this symbol.

    The number of REs (len) is derived from nb_rb, the symbol position within the
    slot, the number of antenna ports and pbch_pss_sss_adjust.

    Always returns 0; a NULL output pointer triggers AssertFatal.
  */
  int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rxF_i=(int16_t*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *ch_mag_i = (int16_t*)&dl_ch_mag_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rho=(int16_t*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *llr16;
  int len;
  uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? (symbol-(7-frame_parms->Ncp)) : symbol;

  if (first_symbol_flag == 1) {
    llr16 = (int16_t*)dlsch_llr;
  } else {
    llr16 = (int16_t*)(*llr16p);
  }

  // report this function's own name (the message previously named nr_dlsch_qpsk_qpsk_llr)
  AssertFatal(llr16!=NULL,"nr_dlsch_qpsk_16qam_llr: llr is null, symbol %d\n",symbol);

  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
    // if symbol has pilots
    if (frame_parms->nb_antenna_ports_gNB!=1)
      // in 2 antenna ports we have 8 REs per symbol per RB
      len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
    else
      // for 1 antenna port we have 10 REs per symbol per RB
      len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
  } else {
    // symbol has no pilots
    len = (nb_rb*12) - pbch_pss_sss_adjust;
  }

  nr_qpsk_qam16((short *)rxF,
             (short *)rxF_i,
             (short *)ch_mag_i,
             (short *)llr16,
             (short *)rho,
             len);

  // 2 LLRs per RE
  llr16 += (len<<1);
  *llr16p = (short *)llr16;

  return(0);
}

/*
#if defined(__x86_64__) || defined(__i386__)
__m128i ONE_OVER_SQRT_2 __attribute__((aligned(16)));
__m128i ONE_OVER_SQRT_10 __attribute__((aligned(16)));
__m128i THREE_OVER_SQRT_10 __attribute__((aligned(16)));
__m128i ONE_OVER_SQRT_10_Q15 __attribute__((aligned(16)));
__m128i SQRT_10_OVER_FOUR __attribute__((aligned(16)));
__m128i ch_mag_int;
#endif
*/
void nr_qpsk_qam16(int16_t *stream0_in,
                int16_t *stream1_in,
                int16_t *ch_mag_i,
                int16_t *stream0_out,
                int16_t *rho01,
                int32_t length
    )
{
  /*
    This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers.

    Parameters:
    stream0_in = Matched filter output y0' = (h0*g0)*y0
    stream1_in = Matched filter output y1' = (h0*g1)*y0
    stream0_out = LLRs
    rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0)
    length = number of resource elements
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i *ch_mag_128i_i    = (__m128i *)ch_mag_i;
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15)
  __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15)
  __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15)
  __m128i ch_mag_int __attribute__((aligned(16)));
#elif defined(__arm__)
  int16x8_t *rho01_128i = (int16x8_t *)rho01;
  int16x8_t *stream0_128i_in = (int16x8_t *)stream0_in;
  int16x8_t *stream1_128i_in = (int16x8_t *)stream1_in;
  int16x8_t *stream0_128i_out = (int16x8_t *)stream0_out;
  int16x8_t *ch_mag_128i_i    = (int16x8_t *)ch_mag_i;
  int16x8_t ONE_OVER_SQRT_2 = vdupq_n_s16(23170); // round(1/sqrt(2)*2^15)
  int16x8_t ONE_OVER_SQRT_10_Q15 = vdupq_n_s16(10362); // round(1/sqrt(10)*2^15)
  int16x8_t THREE_OVER_SQRT_10 = vdupq_n_s16(31086); // round(3/sqrt(10)*2^15)
  int16x8_t SQRT_10_OVER_FOUR = vdupq_n_s16(25905); // round(sqrt(10)/4*2^15)
  int16x8_t ch_mag_int __attribute__((aligned(16)));
#endif

#ifdef DEBUG_LLR
  print_shorts2("rho01_128i:\n",rho01_128i);
#endif

  int i;


  for (i=0; i<length>>2; i+=2) {
    // in each iteration, we take 8 complex samples

#if defined(__x86_64__) || defined(__i386__)

    xmm0 = rho01_128i[i]; // 4 symbols
    xmm1 = rho01_128i[i+1];

    // put (rho_r + rho_i)/2sqrt2 in rho_rpi
    // put (rho_r - rho_i)/2sqrt2 in rho_rmi
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // divide by sqrt(2)
    rho_rpi = _mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_2);
    rho_rmi = _mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_2);
    rho_rpi = _mm_slli_epi16(rho_rpi,1);
    rho_rmi = _mm_slli_epi16(rho_rmi,1);

    // Compute LLR for first bit of stream 0

    // Compute real and imaginary parts of MF output for stream 0
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // divide by sqrt(2)
    y0r_over2 = _mm_mulhi_epi16(y0r, ONE_OVER_SQRT_2);
    y0i_over2 = _mm_mulhi_epi16(y0i, ONE_OVER_SQRT_2);
    y0r_over2  = _mm_slli_epi16(y0r,1);
    y0i_over2  = _mm_slli_epi16(y0i,1);

    y0_p_1_1 = _mm_adds_epi16(y0r_over2, y0i_over2);
    y0_m_1_1 = _mm_subs_epi16(y0r_over2, y0i_over2);

    // Compute real and imaginary parts of MF output for stream 1
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    xmm0 = _mm_setzero_si128(); // ZERO

    // compute psi
    xmm3 = _mm_subs_epi16(y1r,rho_rpi);
    psi_r_p1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1i,rho_rmi);
    psi_i_p1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1r,rho_rmi);
    psi_r_p1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1i,rho_rpi);
    psi_i_p1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1r,rho_rmi);
    psi_r_m1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1i,rho_rpi);
    psi_i_m1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1r,rho_rpi);
    psi_r_m1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1i,rho_rmi);
    psi_i_m1_m1 = _mm_abs_epi16(xmm3);

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];

    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));

    ch_mag_int = _mm_unpacklo_epi64(xmm2,xmm3);

    // calculate optimal interference amplitudes
    interference_abs_epi16(psi_r_p1_p1 , ch_mag_int, a_r_p1_p1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p1 , ch_mag_int, a_i_p1_p1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m1 , ch_mag_int, a_r_p1_m1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m1 , ch_mag_int, a_i_p1_m1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p1 , ch_mag_int, a_r_m1_p1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p1 , ch_mag_int, a_i_m1_p1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m1 , ch_mag_int, a_r_m1_m1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m1 , ch_mag_int, a_i_m1_m1 , ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    // prodsum
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);

    // squares
    square_a_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p1);
    square_a_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m1);
    square_a_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p1);
    square_a_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m1);

    // Computing Metrics
    xmm0 = _mm_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    bit_met_p1_p1 = _mm_adds_epi16(xmm0, y0_p_1_1);

    xmm0 = _mm_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    bit_met_p1_m1 = _mm_adds_epi16(xmm0, y0_m_1_1);

    xmm0 = _mm_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    bit_met_m1_p1 = _mm_subs_epi16(xmm0, y0_m_1_1);

    xmm0 = _mm_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    bit_met_m1_m1 = _mm_subs_epi16(xmm0, y0_p_1_1);

    // MSB
    logmax_num_re0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_m1); // bit=0
    logmax_den_re0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_m1); // bit=1

    y0r = _mm_subs_epi16(logmax_num_re0,logmax_den_re0);

    // LSB
    logmax_num_im0 = _mm_max_epi16(bit_met_p1_p1,bit_met_m1_p1); // bit=0
    logmax_den_im0 = _mm_max_epi16(bit_met_p1_m1,bit_met_m1_m1); // bit=1

    y0i = _mm_subs_epi16(logmax_num_im0,logmax_den_im0);

    stream0_128i_out[i] = _mm_unpacklo_epi16(y0r,y0i); // = [L1(1), L2(1), L1(2), L2(2)]

    if (i<((length>>1) - 1)) // false if only 2 REs remain
      stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i);

#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}

/*
 * Per-symbol LLR wrapper around nr_qpsk_qam64().
 *
 * For the given OFDM symbol, every input buffer is taken from its index-0
 * array at an offset of symbol * N_RB_DL * 12 samples. The number of
 * resource elements to process is derived from nb_rb, the pilot layout of
 * the symbol and pbch_pss_sss_adjust, and the resulting LLRs are written
 * contiguously to the running output pointer.
 *
 * Parameters:
 * frame_parms         = frame configuration (N_RB_DL, Ncp, nb_antenna_ports_gNB are read)
 * rxdataF_comp        = matched filter output of the desired stream
 * rxdataF_comp_i      = matched filter output of the interfering stream
 * dl_ch_mag_i         = scaled magnitude of the interfering channel
 *                       (NOTE(review): the inline scaling remark on the parameter
 *                       looks inherited from the 16-QAM variant - confirm)
 * rho_i               = correlation between the two effective channels
 * dlsch_llr           = start of the LLR buffer, used when first_symbol_flag == 1
 * symbol              = index of the OFDM symbol to process
 * first_symbol_flag   = 1 to (re)start writing at dlsch_llr, otherwise continue at *llr16p
 * nb_rb               = number of resource blocks
 * pbch_pss_sss_adjust = number of REs to subtract from the processed length
 * llr16p              = in/out running write position; updated to point just
 *                       past the LLRs produced by this call (2 per RE)
 *
 * Returns 0.
 */
int nr_dlsch_qpsk_64qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                         int32_t **rxdataF_comp,
                         int32_t **rxdataF_comp_i,
                         int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10})
                         int32_t **rho_i,
                         int16_t *dlsch_llr,
                         uint8_t symbol,
                         uint8_t first_symbol_flag,
                         uint16_t nb_rb,
                         uint16_t pbch_pss_sss_adjust,
                         int16_t **llr16p)
{

  int16_t *rxF=(int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rxF_i=(int16_t*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *ch_mag_i = (int16_t*)&dl_ch_mag_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rho=(int16_t*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *llr16;
  int len;
  // symbol index folded back into the first (7-Ncp) symbols
  uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? (symbol-(7-frame_parms->Ncp)) : symbol;


  // Either restart at the head of the LLR buffer or continue where the
  // previous symbol stopped.
  if (first_symbol_flag == 1) {
    llr16 = dlsch_llr;
  } else {
    llr16 = *llr16p;
  }

  // The message names this function so the failure can be located by grep.
  AssertFatal(llr16!=NULL,"nr_dlsch_qpsk_64qam_llr: llr is null, symbol %d\n",symbol);

  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
    // if symbol has pilots
    if (frame_parms->nb_antenna_ports_gNB!=1)
      // in 2 antenna ports we have 8 REs per symbol per RB
      len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
    else
      // for 1 antenna port we have 10 REs per symbol per RB
      len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
  } else {
    // symbol has no pilots
    len = (nb_rb*12) - pbch_pss_sss_adjust;
  }

  nr_qpsk_qam64((short *)rxF,
             (short *)rxF_i,
             (short *)ch_mag_i,
             (short *)llr16,
             (short *)rho,
             len);

  // two LLRs are produced per RE: advance the running pointer accordingly
  llr16 += (len<<1);
  *llr16p = (short *)llr16;

  return(0);
}
/*
__m128i ONE_OVER_SQRT_2_42 __attribute__((aligned(16)));
__m128i THREE_OVER_SQRT_2_42 __attribute__((aligned(16)));
__m128i FIVE_OVER_SQRT_2_42 __attribute__((aligned(16)));
__m128i SEVEN_OVER_SQRT_2_42 __attribute__((aligned(16)));

__m128i ch_mag_int_with_sigma2 __attribute__((aligned(16)));
__m128i two_ch_mag_int_with_sigma2 __attribute__((aligned(16)));
__m128i three_ch_mag_int_with_sigma2 __attribute__((aligned(16)));
__m128i SQRT_42_OVER_FOUR __attribute__((aligned(16)));
*/
void nr_qpsk_qam64(short *stream0_in,
                short *stream1_in,
                short *ch_mag_i,
                short *stream0_out,
                short *rho01,
                int length
    )
{

  /*
    This function computes the LLRs of stream 0 (s_0) in presence of the interfering stream 1 (s_1) assuming that both symbols are QPSK. It can be used for both MU-MIMO interference-aware receiver or for SU-MIMO receivers.

    Parameters:
    stream0_in = Matched filter output y0' = (h0*g0)*y0
    stream1_in = Matched filter output y1' = (h0*g1)*y0
    stream0_out = LLRs
    rho01 = Correlation between the two effective channels \rho_{10} = (h1*g1)*(h0*g0)
    length = number of resource elements
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i *ch_mag_128i_i    = (__m128i *)ch_mag_i;
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15)
  __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15)
  __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15)
  __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15)
  __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.1
  __m128i ch_mag_int;
  __m128i ch_mag_int_with_sigma2;
  __m128i two_ch_mag_int_with_sigma2;
  __m128i three_ch_mag_int_with_sigma2;
#elif defined(__arm__)

#endif

#ifdef DEBUG_LLR
  print_shorts2("rho01_128i:\n",rho01_128i);
#endif

  int i;


  for (i=0; i<length>>2; i+=2) {
    // in each iteration, we take 8 complex samples

#if defined(__x86_64__) || defined(__i386__)

    xmm0 = rho01_128i[i]; // 4 symbols
    xmm1 = rho01_128i[i+1];

    // put (rho_r + rho_i)/sqrt2 in rho_rpi
    // put (rho_r - rho_i)/sqrt2 in rho_rmi
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // divide by sqrt(2)
    rho_rpi = _mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_2);
    rho_rmi = _mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_2);
    rho_rpi = _mm_slli_epi16(rho_rpi,1);
    rho_rmi = _mm_slli_epi16(rho_rmi,1);

    // Compute LLR for first bit of stream 0

    // Compute real and imaginary parts of MF output for stream 0
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // divide by sqrt(2)
    y0r_over2 = _mm_mulhi_epi16(y0r, ONE_OVER_SQRT_2);
    y0i_over2 = _mm_mulhi_epi16(y0i, ONE_OVER_SQRT_2);
    y0r_over2  = _mm_slli_epi16(y0r,1);
    y0i_over2  = _mm_slli_epi16(y0i,1);

    y0_p_1_1 = _mm_adds_epi16(y0r_over2, y0i_over2);
    y0_m_1_1 = _mm_subs_epi16(y0r_over2, y0i_over2);

    // Compute real and imaginary parts of MF output for stream 1
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];

    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    xmm0 = _mm_setzero_si128(); // ZERO

    // compute psi
    xmm3 = _mm_subs_epi16(y1r,rho_rpi);
    psi_r_p1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1i,rho_rmi);
    psi_i_p1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1r,rho_rmi);
    psi_r_p1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1i,rho_rpi);
    psi_i_p1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1r,rho_rmi);
    psi_r_m1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_subs_epi16(y1i,rho_rpi);
    psi_i_m1_p1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1r,rho_rpi);
    psi_r_m1_m1 = _mm_abs_epi16(xmm3);
    xmm3 = _mm_adds_epi16(y1i,rho_rmi);
    psi_i_m1_m1 = _mm_abs_epi16(xmm3);

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];

    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));

    ch_mag_int = _mm_unpacklo_epi64(xmm2,xmm3);
    ch_mag_int_with_sigma2       = _mm_srai_epi16(ch_mag_int, 1); // *2
    two_ch_mag_int_with_sigma2   = ch_mag_int; // *4
    three_ch_mag_int_with_sigma2 = _mm_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6

    interference_abs_64qam_epi16(psi_r_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    // prodsum
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);

    // Multiply by sqrt(2)
    psi_a_p1_p1 = _mm_mulhi_epi16(psi_a_p1_p1, ONE_OVER_SQRT_2);
    psi_a_p1_p1 = _mm_slli_epi16(psi_a_p1_p1, 2);
    psi_a_p1_m1 = _mm_mulhi_epi16(psi_a_p1_m1, ONE_OVER_SQRT_2);
    psi_a_p1_m1 = _mm_slli_epi16(psi_a_p1_m1, 2);
    psi_a_m1_p1 = _mm_mulhi_epi16(psi_a_m1_p1, ONE_OVER_SQRT_2);
    psi_a_m1_p1 = _mm_slli_epi16(psi_a_m1_p1, 2);
    psi_a_m1_m1 = _mm_mulhi_epi16(psi_a_m1_m1, ONE_OVER_SQRT_2);
    psi_a_m1_m1 = _mm_slli_epi16(psi_a_m1_m1, 2);

    square_a_64qam_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p1);
    square_a_64qam_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m1);
    square_a_64qam_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p1);
    square_a_64qam_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m1);

    // Computing Metrics
    xmm0 = _mm_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    bit_met_p1_p1 = _mm_adds_epi16(xmm0, y0_p_1_1);

    xmm0 = _mm_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    bit_met_p1_m1 = _mm_adds_epi16(xmm0, y0_m_1_1);

    xmm0 = _mm_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    bit_met_m1_p1 = _mm_subs_epi16(xmm0, y0_m_1_1);

    xmm0 = _mm_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    bit_met_m1_m1 = _mm_subs_epi16(xmm0, y0_p_1_1);

    // MSB
    logmax_num_re0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_m1); // bit=0
    logmax_den_re0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_m1); // bit=1

    y0r = _mm_subs_epi16(logmax_num_re0,logmax_den_re0);

    // LSB
    logmax_num_im0 = _mm_max_epi16(bit_met_p1_p1,bit_met_m1_p1); // bit=0
    logmax_den_im0 = _mm_max_epi16(bit_met_p1_m1,bit_met_m1_m1); // bit=1

    y0i = _mm_subs_epi16(logmax_num_im0,logmax_den_im0);

    stream0_128i_out[i] = _mm_unpacklo_epi16(y0r,y0i); // = [L1(1), L2(1), L1(2), L2(2)]

    if (i<((length>>1) - 1)) // false if only 2 REs remain
      stream0_128i_out[i+1] = _mm_unpackhi_epi16(y0r,y0i);

#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}


//----------------------------------------------------------------------------------------------
// 16-QAM
//----------------------------------------------------------------------------------------------

/*
__m128i ONE_OVER_TWO_SQRT_10 __attribute__((aligned(16)));
__m128i NINE_OVER_TWO_SQRT_10 __attribute__((aligned(16)));

__m128i  y0r_over_sqrt10 __attribute__ ((aligned(16)));
__m128i  y0i_over_sqrt10 __attribute__ ((aligned(16)));
__m128i  y0r_three_over_sqrt10 __attribute__ ((aligned(16)));
__m128i  y0i_three_over_sqrt10 __attribute__ ((aligned(16)));

__m128i ch_mag_des __attribute__((aligned(16)));
__m128i ch_mag_over_10 __attribute__ ((aligned(16)));
__m128i ch_mag_over_2 __attribute__ ((aligned(16)));
__m128i ch_mag_9_over_10 __attribute__ ((aligned(16)));
*/

void nr_qam16_qpsk(short *stream0_in,
                short *stream1_in,
                short *ch_mag,
                short *stream0_out,
                short *rho01,
                int length
    )
{

  /*
    Author: Sebastian Wagner
    Date: 2012-06-04

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream!_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      2*h0/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    2*h1/sqrt(00), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i       = (__m128i *)rho01;
  __m128i *stream0_128i_in  = (__m128i *)stream0_in;
  __m128i *stream1_128i_in  = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i *ch_mag_128i      = (__m128i *)ch_mag;
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16)
  __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15)
  __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15)
  __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16)
  __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14)
  __m128i  y0r_over_sqrt10;
  __m128i  y0i_over_sqrt10;
  __m128i  y0r_three_over_sqrt10;
  __m128i  y0i_three_over_sqrt10;

  __m128i ch_mag_des;
  __m128i ch_mag_over_10;
  __m128i ch_mag_over_2;
  __m128i ch_mag_9_over_10;
#elif defined(__arm__)

#endif

  int i;


  for (i=0; i<length>>2; i+=2) {
    // In one iteration, we deal with 8 REs

#if defined(__x86_64__) || defined(__i386__)
    // Get rho
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_10);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi,THREE_OVER_SQRT_10);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi,THREE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_slli_epi16(rho_rpi_3_3,1);
    rho_rmi_3_3 = _mm_slli_epi16(rho_rmi_3_3,1);

    xmm4 = _mm_mulhi_epi16(xmm2,ONE_OVER_SQRT_10); // Re(rho)
    xmm5 = _mm_mulhi_epi16(xmm3,THREE_OVER_SQRT_10); // Im(rho)
    xmm5 = _mm_slli_epi16(xmm5,1);

    rho_rpi_1_3 = _mm_adds_epi16(xmm4,xmm5);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4,xmm5);

    xmm6 = _mm_mulhi_epi16(xmm2,THREE_OVER_SQRT_10); // Re(rho)
    xmm7 = _mm_mulhi_epi16(xmm3,ONE_OVER_SQRT_10); // Im(rho)
    xmm6 = _mm_slli_epi16(xmm6,1);

    rho_rpi_3_1 = _mm_adds_epi16(xmm6,xmm7);
    rho_rmi_3_1 = _mm_subs_epi16(xmm6,xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    xmm0 = _mm_setzero_si128(); // ZERO
    xmm2 = _mm_subs_epi16(rho_rpi_1_1,y1r); // = [Re(rho)+ Im(rho)]/sqrt(10) - y1r
    psi_r_p1_p1 = _mm_abs_epi16(xmm2); // = |[Re(rho)+ Im(rho)]/sqrt(10) - y1r|

    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_1,y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_1);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_3);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_1);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_1);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_1);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_3);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_3);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_3);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);

    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));

    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3); // = [|h|^2(1),|h|^2(2),|h|^2(3),|h|^2(4)]*(2/sqrt(10))

    // Scale MF output of desired signal
    y0r_over_sqrt10 = _mm_mulhi_epi16(y0r,ONE_OVER_SQRT_10);
    y0i_over_sqrt10 = _mm_mulhi_epi16(y0i,ONE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_mulhi_epi16(y0r,THREE_OVER_SQRT_10);
    y0i_three_over_sqrt10 = _mm_mulhi_epi16(y0i,THREE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_slli_epi16(y0r_three_over_sqrt10,1);
    y0i_three_over_sqrt10 = _mm_slli_epi16(y0i_three_over_sqrt10,1);

    // Compute necessary combination of required terms
    y0_p_1_1 = _mm_adds_epi16(y0r_over_sqrt10,y0i_over_sqrt10);
    y0_m_1_1 = _mm_subs_epi16(y0r_over_sqrt10,y0i_over_sqrt10);

    y0_p_1_3 = _mm_adds_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_1_3 = _mm_subs_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);

    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);

    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);

    // Add psi
    psi_a_p1_p1 = _mm_adds_epi16(psi_r_p1_p1 ,psi_i_p1_p1);
    psi_a_p1_p3 = _mm_adds_epi16(psi_r_p1_p3 ,psi_i_p1_p3);
    psi_a_p3_p1 = _mm_adds_epi16(psi_r_p3_p1 ,psi_i_p3_p1);
    psi_a_p3_p3 = _mm_adds_epi16(psi_r_p3_p3 ,psi_i_p3_p3);
    psi_a_p1_m1 = _mm_adds_epi16(psi_r_p1_m1 ,psi_i_p1_m1);
    psi_a_p1_m3 = _mm_adds_epi16(psi_r_p1_m3 ,psi_i_p1_m3);
    psi_a_p3_m1 = _mm_adds_epi16(psi_r_p3_m1 ,psi_i_p3_m1);
    psi_a_p3_m3 = _mm_adds_epi16(psi_r_p3_m3 ,psi_i_p3_m3);
    psi_a_m1_p1 = _mm_adds_epi16(psi_r_m1_p1 ,psi_i_m1_p1);
    psi_a_m1_p3 = _mm_adds_epi16(psi_r_m1_p3 ,psi_i_m1_p3);
    psi_a_m3_p1 = _mm_adds_epi16(psi_r_m3_p1 ,psi_i_m3_p1);
    psi_a_m3_p3 = _mm_adds_epi16(psi_r_m3_p3 ,psi_i_m3_p3);
    psi_a_m1_m1 = _mm_adds_epi16(psi_r_m1_m1 ,psi_i_m1_m1);
    psi_a_m1_m3 = _mm_adds_epi16(psi_r_m1_m3 ,psi_i_m1_m3);
    psi_a_m3_m1 = _mm_adds_epi16(psi_r_m3_m1 ,psi_i_m3_m1);
    psi_a_m3_m3 = _mm_adds_epi16(psi_r_m3_m3 ,psi_i_m3_m3);

    // scale by sqrt(2)
    psi_a_p1_p1 = _mm_mulhi_epi16(psi_a_p1_p1,ONE_OVER_SQRT_2);
    psi_a_p1_p1 = _mm_slli_epi16(psi_a_p1_p1,1);
    psi_a_p1_p3 = _mm_mulhi_epi16(psi_a_p1_p3,ONE_OVER_SQRT_2);
    psi_a_p1_p3 = _mm_slli_epi16(psi_a_p1_p3,1);
    psi_a_p3_p1 = _mm_mulhi_epi16(psi_a_p3_p1,ONE_OVER_SQRT_2);
    psi_a_p3_p1 = _mm_slli_epi16(psi_a_p3_p1,1);
    psi_a_p3_p3 = _mm_mulhi_epi16(psi_a_p3_p3,ONE_OVER_SQRT_2);
    psi_a_p3_p3 = _mm_slli_epi16(psi_a_p3_p3,1);

    psi_a_p1_m1 = _mm_mulhi_epi16(psi_a_p1_m1,ONE_OVER_SQRT_2);
    psi_a_p1_m1 = _mm_slli_epi16(psi_a_p1_m1,1);
    psi_a_p1_m3 = _mm_mulhi_epi16(psi_a_p1_m3,ONE_OVER_SQRT_2);
    psi_a_p1_m3 = _mm_slli_epi16(psi_a_p1_m3,1);
    psi_a_p3_m1 = _mm_mulhi_epi16(psi_a_p3_m1,ONE_OVER_SQRT_2);
    psi_a_p3_m1 = _mm_slli_epi16(psi_a_p3_m1,1);
    psi_a_p3_m3 = _mm_mulhi_epi16(psi_a_p3_m3,ONE_OVER_SQRT_2);
    psi_a_p3_m3 = _mm_slli_epi16(psi_a_p3_m3,1);

    psi_a_m1_p1 = _mm_mulhi_epi16(psi_a_m1_p1,ONE_OVER_SQRT_2);
    psi_a_m1_p1 = _mm_slli_epi16(psi_a_m1_p1,1);
    psi_a_m1_p3 = _mm_mulhi_epi16(psi_a_m1_p3,ONE_OVER_SQRT_2);
    psi_a_m1_p3 = _mm_slli_epi16(psi_a_m1_p3,1);
    psi_a_m3_p1 = _mm_mulhi_epi16(psi_a_m3_p1,ONE_OVER_SQRT_2);
    psi_a_m3_p1 = _mm_slli_epi16(psi_a_m3_p1,1);
    psi_a_m3_p3 = _mm_mulhi_epi16(psi_a_m3_p3,ONE_OVER_SQRT_2);
    psi_a_m3_p3 = _mm_slli_epi16(psi_a_m3_p3,1);

    psi_a_m1_m1 = _mm_mulhi_epi16(psi_a_m1_m1,ONE_OVER_SQRT_2);
    psi_a_m1_m1 = _mm_slli_epi16(psi_a_m1_m1,1);
    psi_a_m1_m3 = _mm_mulhi_epi16(psi_a_m1_m3,ONE_OVER_SQRT_2);
    psi_a_m1_m3 = _mm_slli_epi16(psi_a_m1_m3,1);
    psi_a_m3_m1 = _mm_mulhi_epi16(psi_a_m3_m1,ONE_OVER_SQRT_2);
    psi_a_m3_m1 = _mm_slli_epi16(psi_a_m3_m1,1);
    psi_a_m3_m3 = _mm_mulhi_epi16(psi_a_m3_m3,ONE_OVER_SQRT_2);
    psi_a_m3_m3 = _mm_slli_epi16(psi_a_m3_m3,1);

    // Computing different multiples of channel norms
    ch_mag_over_10=_mm_mulhi_epi16(ch_mag_des, ONE_OVER_TWO_SQRT_10);
    ch_mag_over_2=_mm_mulhi_epi16(ch_mag_des, SQRT_10_OVER_FOUR);
    ch_mag_over_2=_mm_slli_epi16(ch_mag_over_2, 1);
    ch_mag_9_over_10=_mm_mulhi_epi16(ch_mag_des, NINE_OVER_TWO_SQRT_10);
    ch_mag_9_over_10=_mm_slli_epi16(ch_mag_9_over_10, 2);

    // Computing Metrics
    xmm1 = _mm_adds_epi16(psi_a_p1_p1, y0_p_1_1);
    bit_met_p1_p1= _mm_subs_epi16(xmm1, ch_mag_over_10);

    xmm1 = _mm_adds_epi16(psi_a_p1_p3, y0_p_1_3);
    bit_met_p1_p3= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_adds_epi16(psi_a_p1_m1, y0_m_1_1);
    bit_met_p1_m1= _mm_subs_epi16(xmm1, ch_mag_over_10);

    xmm1 = _mm_adds_epi16(psi_a_p1_m3, y0_m_1_3);
    bit_met_p1_m3= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_adds_epi16(psi_a_p3_p1, y0_p_3_1);
    bit_met_p3_p1= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_adds_epi16(psi_a_p3_p3, y0_p_3_3);
    bit_met_p3_p3= _mm_subs_epi16(xmm1, ch_mag_9_over_10);

    xmm1 = _mm_adds_epi16(psi_a_p3_m1, y0_m_3_1);
    bit_met_p3_m1= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_adds_epi16(psi_a_p3_m3, y0_m_3_3);
    bit_met_p3_m3= _mm_subs_epi16(xmm1, ch_mag_9_over_10);

    xmm1 = _mm_subs_epi16(psi_a_m1_p1, y0_m_1_1);
    bit_met_m1_p1= _mm_subs_epi16(xmm1, ch_mag_over_10);

    xmm1 = _mm_subs_epi16(psi_a_m1_p3, y0_m_1_3);
    bit_met_m1_p3= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_subs_epi16(psi_a_m1_m1, y0_p_1_1);
    bit_met_m1_m1= _mm_subs_epi16(xmm1, ch_mag_over_10);

    xmm1 = _mm_subs_epi16(psi_a_m1_m3, y0_p_1_3);
    bit_met_m1_m3= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_subs_epi16(psi_a_m3_p1, y0_m_3_1);
    bit_met_m3_p1= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_subs_epi16(psi_a_m3_p3, y0_m_3_3);
    bit_met_m3_p3= _mm_subs_epi16(xmm1, ch_mag_9_over_10);

    xmm1 = _mm_subs_epi16(psi_a_m3_m1, y0_p_3_1);
    bit_met_m3_m1= _mm_subs_epi16(xmm1, ch_mag_over_2);

    xmm1 = _mm_subs_epi16(psi_a_m3_m3, y0_p_3_3);
    bit_met_m3_m3= _mm_subs_epi16(xmm1, ch_mag_9_over_10);

    // LLR of the first bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re0= _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm1 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of first bit [L1(1), L1(2), L1(3), L1(4), L1(5), L1(6), L1(7), L1(8)]
    y0r = _mm_subs_epi16(logmax_den_re0,logmax_num_re0);

    // LLR of the second bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm1 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of second bit [L2(1), L2(2), L2(3), L2(4)]
    y1r = _mm_subs_epi16(logmax_den_re1,logmax_num_re1);

    // LLR of the third bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm1 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im0 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of third bit [L3(1), L3(2), L3(3), L3(4)]
    y0i = _mm_subs_epi16(logmax_den_im0,logmax_num_im0);

    // LLR of the fourth bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm1 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm3 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of fourth bit [L4(1), L4(2), L4(3), L4(4)]
    y1i = _mm_subs_epi16(logmax_den_im1,logmax_num_im1);

    // Pack LLRs in output
    // [L1(1), L2(1), L1(2), L2(2), L1(3), L2(3), L1(4), L2(4)]
    xmm0 = _mm_unpacklo_epi16(y0r,y1r);
    // [L1(5), L2(5), L1(6), L2(6), L1(7), L2(7), L1(8), L2(8)]
    xmm1 = _mm_unpackhi_epi16(y0r,y1r);
    // [L3(1), L4(1), L3(2), L4(2), L3(3), L4(3), L3(4), L4(4)]
    xmm2 = _mm_unpacklo_epi16(y0i,y1i);
    // [L3(5), L4(5), L3(6), L4(6), L3(7), L4(7), L3(8), L4(8)]
    xmm3 = _mm_unpackhi_epi16(y0i,y1i);

    stream0_128i_out[2*i+0] = _mm_unpacklo_epi32(xmm0,xmm2); // 8LLRs, 2REs
    stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2);
    stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3);
    stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3);

#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

}

/*! \brief Per-symbol driver for the 16QAM (desired) / QPSK (interferer) LLR kernel.
 *
 * Locates the data of OFDM symbol \p symbol in antenna entry 0 of each input
 * buffer, works out how many resource elements (REs) to process, runs
 * nr_qam16_qpsk() on them and moves the LLR write pointer forward.
 *
 * \param frame_parms         frame configuration (N_RB_DL, Ncp, nb_antenna_ports_gNB are read)
 * \param rxdataF_comp        matched-filter output of the desired stream
 * \param rxdataF_comp_i      matched-filter output of the interfering stream
 * \param dl_ch_mag           desired channel magnitude, |h_0|^2*(2/sqrt{10})
 * \param rho_i               channel cross-correlation
 * \param dlsch_llr           start of the LLR buffer, used when first_symbol_flag == 1
 * \param symbol              OFDM symbol index
 * \param first_symbol_flag   1: write from dlsch_llr, otherwise continue from *llr16p
 * \param nb_rb               number of resource blocks
 * \param pbch_pss_sss_adjust number of REs to subtract from the RE count (scaled on pilot symbols)
 * \param llr16p              in: current LLR write position; out: position after this symbol
 * \return always 0
 */
int nr_dlsch_16qam_qpsk_llr(NR_DL_FRAME_PARMS *frame_parms,
                            int32_t **rxdataF_comp,
                            int32_t **rxdataF_comp_i,
                            int32_t **dl_ch_mag,   //|h_0|^2*(2/sqrt{10})
                            int32_t **rho_i,
                            int16_t *dlsch_llr,
                            uint8_t symbol,
                            uint8_t first_symbol_flag,
                            uint16_t nb_rb,
                            uint16_t pbch_pss_sss_adjust,
                            int16_t **llr16p)
{
  // 16-bit views of this symbol's samples (12 REs per RB, N_RB_DL RBs per symbol)
  int16_t *desired_mf  = (int16_t *)(rxdataF_comp[0]   + (symbol * frame_parms->N_RB_DL * 12));
  int16_t *interf_mf   = (int16_t *)(rxdataF_comp_i[0] + (symbol * frame_parms->N_RB_DL * 12));
  int16_t *desired_mag = (int16_t *)(dl_ch_mag[0]      + (symbol * frame_parms->N_RB_DL * 12));
  int16_t *cross_corr  = (int16_t *)(rho_i[0]          + (symbol * frame_parms->N_RB_DL * 12));

  // Symbol index folded back into the first half when it lies at or beyond 7-Ncp
  uint8_t symbol_mod = symbol;

  if (symbol >= (7 - frame_parms->Ncp)) {
    symbol_mod = symbol - (7 - frame_parms->Ncp);
  }

  // The first symbol starts at the head of the LLR buffer, later ones continue
  // from the position left by the previous call
  int16_t *llr16 = (first_symbol_flag == 1) ? dlsch_llr : *llr16p;

  AssertFatal(llr16!=NULL,"nr_dlsch_16qam_qpsk_llr: llr is null, symbol %d\n",symbol);

  // Number of REs to process in this symbol
  const int pilot_symbol = (symbol_mod == 0) || (symbol_mod == (4 - frame_parms->Ncp));
  int len;

  if (!pilot_symbol) {
    // no pilots: all 12 REs per RB are used
    len = (nb_rb*12) - pbch_pss_sss_adjust;
  } else if (frame_parms->nb_antenna_ports_gNB == 1) {
    // pilots, single antenna port: 10 REs per RB
    len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
  } else {
    // pilots, more than one antenna port: 8 REs per RB
    len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
  }

  nr_qam16_qpsk((short *)desired_mf,
                (short *)interf_mf,
                (short *)desired_mag,
                (short *)llr16,
                (short *)cross_corr,
                len);

  // 4 LLRs are produced per RE
  *llr16p = (short *)(llr16 + (len<<2));

  return 0;
}

void nr_qam16_qam16(short *stream0_in,
                 short *stream1_in,
                 short *ch_mag,
                 short *ch_mag_i,
                 short *stream0_out,
                 short *rho01,
                 int length
     )
{

  /*
    Author: Sebastian Wagner
    Date: 2012-06-04

    Max-log LLR computation for a 16QAM desired stream in the presence of a
    16QAM interfering stream.

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      2*|h0|^2/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    2*|h1|^2/sqrt(10), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0
    length:      number of REs; the loop below consumes 8 REs per iteration

    Output:
    stream0_out: output LLRs for 1st stream, 4 LLRs per RE

    Notes:
    - All buffers are accessed through __m128i pointers with plain indexing, so
      they are presumably expected to be 16-byte aligned (TODO confirm with callers).
    - The temporaries xmm0..xmm7, rho_*, psi_*, a_*, y0_*, bit_met_* and logmax_*
      are not declared in this function; they are defined outside of it.
      NOTE(review): concurrent calls would therefore presumably share them - confirm.
    - Only the x86 path is implemented; on other targets the loop body is empty
      and stream0_out is left untouched.
  */
#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i       = (__m128i *)rho01;
  __m128i *stream0_128i_in  = (__m128i *)stream0_in;
  __m128i *stream1_128i_in  = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i *ch_mag_128i      = (__m128i *)ch_mag;
  __m128i *ch_mag_128i_i    = (__m128i *)ch_mag_i;



  // Fixed-point constants. _mm_mulhi_epi16 keeps the upper 16 bits of the
  // 32-bit product (i.e. (a*b)>>16), so a constant scaled by 2^16 needs no
  // correction, one scaled by 2^15 is followed by <<1 and one scaled by 2^14 by <<2.
  __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16)
  __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15)
  __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15)
  __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15)
  __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16)
  __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14)
  __m128i ch_mag_des,ch_mag_int;
  __m128i  y0r_over_sqrt10;
  __m128i  y0i_over_sqrt10;
  __m128i  y0r_three_over_sqrt10;
  __m128i  y0i_three_over_sqrt10;
  __m128i ch_mag_over_10;
  __m128i ch_mag_over_2;
  __m128i ch_mag_9_over_10;
#elif defined(__arm__)

#endif

  int i;

  // i indexes 128-bit input vectors (4 complex int16 samples each); two vectors
  // are consumed per iteration, hence the step of 2 and the bound length/4.
  for (i=0; i<length>>2; i+=2) {
    // In one iteration, we deal with 8 REs

#if defined(__x86_64__) || defined(__i386__)
    // Get rho
    // The three shuffles with 0xd8 de-interleave [Re Im Re Im ...] into
    // [Re Re Re Re | Im Im Im Im] within each vector.
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho), 8 lanes
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho), 8 lanes
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    // Suffix _a_b: Re(rho) weighted by a/sqrt(10), Im(rho) weighted by b/sqrt(10)
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_10);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi,THREE_OVER_SQRT_10);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi,THREE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_slli_epi16(rho_rpi_3_3,1);
    rho_rmi_3_3 = _mm_slli_epi16(rho_rmi_3_3,1);

    xmm4 = _mm_mulhi_epi16(xmm2,ONE_OVER_SQRT_10); // Re(rho)/sqrt(10)
    xmm5 = _mm_mulhi_epi16(xmm3,THREE_OVER_SQRT_10); // Im(rho)*3/sqrt(10), before Q15 correction
    xmm5 = _mm_slli_epi16(xmm5,1);

    rho_rpi_1_3 = _mm_adds_epi16(xmm4,xmm5);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4,xmm5);

    xmm6 = _mm_mulhi_epi16(xmm2,THREE_OVER_SQRT_10); // Re(rho)*3/sqrt(10), before Q15 correction
    xmm7 = _mm_mulhi_epi16(xmm3,ONE_OVER_SQRT_10); // Im(rho)/sqrt(10)
    xmm6 = _mm_slli_epi16(xmm6,1);

    rho_rpi_3_1 = _mm_adds_epi16(xmm6,xmm7);
    rho_rmi_3_1 = _mm_subs_epi16(xmm6,xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),...,y1r(8)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),...,y1i(8)]

    // NOTE(review): xmm0 is reloaded from stream0_128i_in below before any
    // visible read, so this zeroing looks like a dead store - confirm.
    xmm0 = _mm_setzero_si128(); // ZERO
    // psi_{r,i}_<a>_<b>: absolute value of a weighted rho combination plus or
    // minus the interfering MF output, one per desired-symbol hypothesis (a,b)
    xmm2 = _mm_subs_epi16(rho_rpi_1_1,y1r); // = [Re(rho)+ Im(rho)]/sqrt(10) - y1r
    psi_r_p1_p1 = _mm_abs_epi16(xmm2); // = |[Re(rho)+ Im(rho)]/sqrt(10) - y1r|

    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_1,y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_1);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_3);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_1);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_1);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_1);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_3);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_3);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_3);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);

    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),...,y0r(8)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1); // = [y0i(1),y0i(2),...,y0i(8)]

    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2),...]*(2/sqrt(10)), REs 1..4
    xmm3 = ch_mag_128i[i+1]; // same layout, REs 5..8
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)

    // Only the low halves are kept: the duplicated (Im=Re) copies are dropped
    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3); // = [|h|^2(1),|h|^2(2),...,|h|^2(8)]*(2/sqrt(10))

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];

    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); // 0xd8 = _MM_SHUFFLE(3,1,2,0)

    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);

    // Scale MF output of desired signal
    y0r_over_sqrt10 = _mm_mulhi_epi16(y0r,ONE_OVER_SQRT_10);
    y0i_over_sqrt10 = _mm_mulhi_epi16(y0i,ONE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_mulhi_epi16(y0r,THREE_OVER_SQRT_10);
    y0i_three_over_sqrt10 = _mm_mulhi_epi16(y0i,THREE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_slli_epi16(y0r_three_over_sqrt10,1);
    y0i_three_over_sqrt10 = _mm_slli_epi16(y0i_three_over_sqrt10,1);

    // Compute necessary combination of required terms
    // y0_p_a_b = a*y0r/sqrt(10) + b*y0i/sqrt(10), y0_m_a_b = a*y0r/sqrt(10) - b*y0i/sqrt(10)
    y0_p_1_1 = _mm_adds_epi16(y0r_over_sqrt10,y0i_over_sqrt10);
    y0_m_1_1 = _mm_subs_epi16(y0r_over_sqrt10,y0i_over_sqrt10);

    y0_p_1_3 = _mm_adds_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_1_3 = _mm_subs_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);

    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);

    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);

    // Compute optimal interfering symbol magnitude
    // interference_abs_epi16 is defined outside this function (presumably a
    // macro, since its third argument is written and read back below).
    interference_abs_epi16(psi_r_p1_p1 ,ch_mag_int,a_r_p1_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p1 ,ch_mag_int,a_i_p1_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p3 ,ch_mag_int,a_r_p1_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p3 ,ch_mag_int,a_i_p1_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m1 ,ch_mag_int,a_r_p1_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m1 ,ch_mag_int,a_i_p1_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m3 ,ch_mag_int,a_r_p1_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m3 ,ch_mag_int,a_i_p1_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p1 ,ch_mag_int,a_r_p3_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p1 ,ch_mag_int,a_i_p3_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p3 ,ch_mag_int,a_r_p3_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p3 ,ch_mag_int,a_i_p3_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m1 ,ch_mag_int,a_r_p3_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m1 ,ch_mag_int,a_i_p3_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m3 ,ch_mag_int,a_r_p3_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m3 ,ch_mag_int,a_i_p3_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p1 ,ch_mag_int,a_r_m1_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p1 ,ch_mag_int,a_i_m1_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p3 ,ch_mag_int,a_r_m1_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p3 ,ch_mag_int,a_i_m1_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m1 ,ch_mag_int,a_r_m1_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m1 ,ch_mag_int,a_i_m1_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m3 ,ch_mag_int,a_r_m1_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m3 ,ch_mag_int,a_i_m1_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p1 ,ch_mag_int,a_r_m3_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p1 ,ch_mag_int,a_i_m3_p1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p3 ,ch_mag_int,a_r_m3_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p3 ,ch_mag_int,a_i_m3_p3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m1 ,ch_mag_int,a_r_m3_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m1 ,ch_mag_int,a_i_m3_m1,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m3 ,ch_mag_int,a_r_m3_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m3 ,ch_mag_int,a_i_m3_m3,ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    // Calculation of groups of two terms in the bit metric involving product of psi and interference magnitude
    // (prodsum_psi_a_epi16 is defined outside this function; its last argument receives the result)
    prodsum_psi_a_epi16(psi_r_p1_p1,a_r_p1_p1,psi_i_p1_p1,a_i_p1_p1,psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_p3,a_r_p1_p3,psi_i_p1_p3,a_i_p1_p3,psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1,a_r_p3_p1,psi_i_p3_p1,a_i_p3_p1,psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_p3,a_r_p3_p3,psi_i_p3_p3,a_i_p3_p3,psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p1_m1,a_r_p1_m1,psi_i_p1_m1,a_i_p1_m1,psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3,a_r_p1_m3,psi_i_p1_m3,a_i_p1_m3,psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p3_m1,a_r_p3_m1,psi_i_p3_m1,a_i_p3_m1,psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3,a_r_p3_m3,psi_i_p3_m3,a_i_p3_m3,psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_m1_p1,a_r_m1_p1,psi_i_m1_p1,a_i_m1_p1,psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_p3,a_r_m1_p3,psi_i_m1_p3,a_i_m1_p3,psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1,a_r_m3_p1,psi_i_m3_p1,a_i_m3_p1,psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_p3,a_r_m3_p3,psi_i_m3_p3,a_i_m3_p3,psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m1_m1,a_r_m1_m1,psi_i_m1_m1,a_i_m1_m1,psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3,a_r_m1_m3,psi_i_m1_m3,a_i_m1_m3,psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m3_m1,a_r_m3_m1,psi_i_m3_m1,a_i_m3_m1,psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3,a_r_m3_m3,psi_i_m3_m3,a_i_m3_m3,psi_a_m3_m3);


    // squared interference magnitude times int. ch. power
    // (square_a_epi16 is defined outside this function; its last argument receives the result)
    square_a_epi16(a_r_p1_p1,a_i_p1_p1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p1_p1);
    square_a_epi16(a_r_p1_p3,a_i_p1_p3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p1_p3);
    square_a_epi16(a_r_p3_p1,a_i_p3_p1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p3_p1);
    square_a_epi16(a_r_p3_p3,a_i_p3_p3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p3_p3);
    square_a_epi16(a_r_p1_m1,a_i_p1_m1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p1_m1);
    square_a_epi16(a_r_p1_m3,a_i_p1_m3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p1_m3);
    square_a_epi16(a_r_p3_m1,a_i_p3_m1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p3_m1);
    square_a_epi16(a_r_p3_m3,a_i_p3_m3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_p3_m3);
    square_a_epi16(a_r_m1_p1,a_i_m1_p1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m1_p1);
    square_a_epi16(a_r_m1_p3,a_i_m1_p3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m1_p3);
    square_a_epi16(a_r_m3_p1,a_i_m3_p1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m3_p1);
    square_a_epi16(a_r_m3_p3,a_i_m3_p3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m3_p3);
    square_a_epi16(a_r_m1_m1,a_i_m1_m1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m1_m1);
    square_a_epi16(a_r_m1_m3,a_i_m1_m3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m1_m3);
    square_a_epi16(a_r_m3_m1,a_i_m3_m1,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m3_m1);
    square_a_epi16(a_r_m3_m3,a_i_m3_m3,ch_mag_int,SQRT_10_OVER_FOUR,a_sq_m3_m3);

    // Computing different multiples of channel norms
    // With ch_mag_des = |h|^2*(2/sqrt(10)) these are |h|^2/10, |h|^2/2 and 9*|h|^2/10
    ch_mag_over_10=_mm_mulhi_epi16(ch_mag_des, ONE_OVER_TWO_SQRT_10);
    ch_mag_over_2=_mm_mulhi_epi16(ch_mag_des, SQRT_10_OVER_FOUR);
    ch_mag_over_2=_mm_slli_epi16(ch_mag_over_2, 1);
    ch_mag_9_over_10=_mm_mulhi_epi16(ch_mag_des, NINE_OVER_TWO_SQRT_10);
    ch_mag_9_over_10=_mm_slli_epi16(ch_mag_9_over_10, 2);

    // Computing Metrics
    // bit_met_<a>_<b> = psi_a - a_sq +/- (desired MF combination) - (channel norm multiple),
    // one metric per desired 16QAM hypothesis (a,b) with a,b in {+-1,+-3}
    xmm0 = _mm_subs_epi16(psi_a_p1_p1,a_sq_p1_p1);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_1_1);
    bit_met_p1_p1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p1_p3,a_sq_p1_p3);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_1_3);
    bit_met_p1_p3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p1_m1,a_sq_p1_m1);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_1_1);
    bit_met_p1_m1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p1_m3,a_sq_p1_m3);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_1_3);
    bit_met_p1_m3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_p1,a_sq_p3_p1);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_3_1);
    bit_met_p3_p1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_p3,a_sq_p3_p3);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_3_3);
    bit_met_p3_p3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p3_m1,a_sq_p3_m1);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_3_1);
    bit_met_p3_m1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_m3,a_sq_p3_m3);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_3_3);
    bit_met_p3_m3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_p1,a_sq_m1_p1);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_1_1);
    bit_met_m1_p1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_p3,a_sq_m1_p3);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_1_3);
    bit_met_m1_p3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m1_m1,a_sq_m1_m1);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_1_1);
    bit_met_m1_m1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_m3,a_sq_m1_m3);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_1_3);
    bit_met_m1_m3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_p1,a_sq_m3_p1);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_3_1);
    bit_met_m3_p1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_p3,a_sq_m3_p3);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_3_3);
    bit_met_m3_p3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m3_m1,a_sq_m3_m1);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_3_1);
    bit_met_m3_m1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_m3,a_sq_m3_m3);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_3_3);
    bit_met_m3_m3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    // Each LLR below is max(metrics with bit = 0) - max(metrics with bit = 1)

    // LLR of the first bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re0= _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm1 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of first bit [L1(1), L1(2), L1(3), L1(4), L1(5), L1(6), L1(7), L1(8)]
    y0r = _mm_subs_epi16(logmax_den_re0,logmax_num_re0);

    // LLR of the second bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm1 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of second bit [L2(1), L2(2), ..., L2(8)]
    y1r = _mm_subs_epi16(logmax_den_re1,logmax_num_re1);

    // LLR of the third bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm1 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im0 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of third bit [L3(1), L3(2), ..., L3(8)]
    y0i = _mm_subs_epi16(logmax_den_im0,logmax_num_im0);

    // LLR of the fourth bit
    // Bit = 1
    xmm0 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm1 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm3 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of fourth bit [L4(1), L4(2), ..., L4(8)]
    y1i = _mm_subs_epi16(logmax_den_im1,logmax_num_im1);

    // Pack LLRs in output
    // [L1(1), L2(1), L1(2), L2(2), L1(3), L2(3), L1(4), L2(4)]
    xmm0 = _mm_unpacklo_epi16(y0r,y1r);
    // [L1(5), L2(5), L1(6), L2(6), L1(7), L2(7), L1(8), L2(8)]
    xmm1 = _mm_unpackhi_epi16(y0r,y1r);
    // [L3(1), L4(1), L3(2), L4(2), L3(3), L4(3), L3(4), L4(4)]
    xmm2 = _mm_unpacklo_epi16(y0i,y1i);
    // [L3(5), L4(5), L3(6), L4(6), L3(7), L4(7), L3(8), L4(8)]
    xmm3 = _mm_unpackhi_epi16(y0i,y1i);

    // Interleave to [L1 L2 L3 L4] per RE; four output vectors (32 LLRs) per iteration
    stream0_128i_out[2*i+0] = _mm_unpacklo_epi32(xmm0,xmm2); // 8LLRs, 2REs
    stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2);
    stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3);
    stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3);
#elif defined(__arm__)

#endif

  }

#if defined(__x86_64__) || defined(__i386__)
  // NOTE(review): _m_empty is an alias of _mm_empty (EMMS), so the second call
  // looks redundant - confirm before removing.
  _mm_empty();
  _m_empty();
#endif
}

/*
 * Per-symbol LLR front end for a 16QAM desired stream with a 16QAM
 * interfering stream.
 *
 * Locates the data of OFDM symbol `symbol` inside the first entry ([0]) of
 * each input buffer (offset symbol*nb_rb*12 32-bit elements), hands the
 * resulting int16 views to nr_qam16_qam16() and then advances the running
 * LLR write pointer.
 *
 *   frame_parms          not used by this function
 *   rxdataF_comp         matched-filter output, desired stream
 *   rxdataF_comp_i       matched-filter output, interfering stream
 *   dl_ch_mag            desired channel magnitude (see parameter comment)
 *   dl_ch_mag_i          interfering channel magnitude (see parameter comment)
 *   rho_i                channel cross-correlation
 *   dlsch_llr            start of the LLR buffer; used as the write position
 *                        when first_symbol_flag == 1
 *   symbol               OFDM symbol index
 *   len                  number of resource elements passed to nr_qam16_qam16()
 *   first_symbol_flag    1: write at dlsch_llr, otherwise continue at *llr16p
 *   nb_rb                number of resource blocks (used for the symbol offset)
 *   pbch_pss_sss_adjust  not used by this function
 *   llr16p               in/out: running LLR write pointer; on return it is
 *                        the write position plus 4*len int16 values
 *
 * Always returns 0. A NULL write position aborts through AssertFatal.
 */
int nr_dlsch_16qam_16qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                          int32_t **rxdataF_comp,
                          int32_t **rxdataF_comp_i,
                          int32_t **dl_ch_mag,   //|h_0|^2*(2/sqrt{10})
                          int32_t **dl_ch_mag_i, //|h_1|^2*(2/sqrt{10})
                          int32_t **rho_i,
                          int16_t *dlsch_llr,
                          uint8_t symbol,
                          uint32_t len,
                          uint8_t first_symbol_flag,
                          uint16_t nb_rb,
                          uint16_t pbch_pss_sss_adjust,
                          int16_t **llr16p)
{
  // Offset of this symbol inside every per-symbol buffer (32-bit elements)
  const int symbol_offset = symbol*nb_rb*12;

  // int16 views on the slices belonging to this symbol
  int16_t *mf_desired  = (int16_t *)&rxdataF_comp[0][symbol_offset];
  int16_t *mf_interf   = (int16_t *)&rxdataF_comp_i[0][symbol_offset];
  int16_t *mag_desired = (int16_t *)&dl_ch_mag[0][symbol_offset];
  int16_t *mag_interf  = (int16_t *)&dl_ch_mag_i[0][symbol_offset];
  int16_t *cross_corr  = (int16_t *)&rho_i[0][symbol_offset];

  // The first symbol restarts at the head of the LLR buffer; every other
  // symbol continues where the previous call stopped.
  int16_t *llr16 = (first_symbol_flag == 1) ? dlsch_llr : *llr16p;

  AssertFatal(llr16!=NULL,"nr_dlsch_16qam_16qam_llr: llr is null, symbol %d\n",symbol);

  nr_qam16_qam16(mf_desired,
                 mf_interf,
                 mag_desired,
                 mag_interf,
                 llr16,
                 cross_corr,
                 len);

  // 4 LLRs are produced per resource element
  *llr16p = llr16 + (len<<2);

  return 0;
}

void nr_qam16_qam64(int16_t *stream0_in,
                 int16_t *stream1_in,
                 int16_t *ch_mag,
                 int16_t *ch_mag_i,
                 int16_t *stream0_out,
                 int16_t *rho01,
                 int32_t length
     )
{

  /*
    Author: Sebastian Wagner
    Date: 2012-06-04

    Computes the LLRs of a 16QAM desired stream in the presence of a 64QAM
    interfering stream. For each of the 16 candidate desired symbols
    (re, im) in {+-1, +-3}^2 a bit metric is built; each LLR is then
    max(metrics with bit = 0) - max(metrics with bit = 1).

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y,
                 interleaved int16 [Re0 Im0 Re1 Im1 ...]
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y, same layout
    ch_mag:      desired channel magnitude, |h0|^2*(2/sqrt(10)) according to
                 the inline comments below,
                 [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    interfering channel magnitude, same duplicated layout
                 (NOTE(review): its exact scaling is not visible in this
                 function -- confirm against the caller)
    rho01:       Channel cross correlation, i.e., h1'*h0
    length:      number of resource elements (REs). The loop runs for
                 i = 0, 2, 4, ... while i < (length>>2); every iteration
                 always handles a full group of 8 REs.

    Output:
    stream0_out: output LLRs for 1st stream, 4 int16 LLRs per RE

    Notes:
    - All buffers are accessed by plain indexing through __m128i pointers,
      so they are expected to be 16-byte aligned.
    - Most working variables (xmm0..xmm7, y0r/y0i/y1r/y1i, rho_*, psi_*,
      a_*, a_sq_*, psi_a_*, y0_p_* / y0_m_*, bit_met_*, logmax_*) are not
      declared in this function. NOTE(review): presumably file-scope
      variables like rho_rpi/rho_rmi at the top of the file; if so this
      routine is not reentrant -- confirm.
    - Only the x86 (SSE) path is implemented; the ARM branches are empty.
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i       = (__m128i *)rho01;
  __m128i *stream0_128i_in  = (__m128i *)stream0_in;
  __m128i *stream1_128i_in  = (__m128i *)stream1_in;
  __m128i *stream0_128i_out = (__m128i *)stream0_out;
  __m128i *ch_mag_128i      = (__m128i *)ch_mag;
  __m128i *ch_mag_128i_i    = (__m128i *)ch_mag_i;


  // Fixed-point constants. _mm_mulhi_epi16 returns (a*b)>>16, so a constant
  // stored as c*2^15 (resp. c*2^14, c*2^13) needs a left shift by 1
  // (resp. 2, 3) after the multiply to obtain a*c; a constant stored as
  // c*2^16 needs no shift.
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i ONE_OVER_SQRT_10 = _mm_set1_epi16(20724); // round(1/sqrt(10)*2^16)
  __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15)
  __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15)
  __m128i ONE_OVER_TWO_SQRT_10 = _mm_set1_epi16(10362); // round(1/2/sqrt(10)*2^16)
  __m128i NINE_OVER_TWO_SQRT_10 = _mm_set1_epi16(23315); // round(9/2/sqrt(10)*2^14)
  __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15)
  __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15)
  __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15)
  __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15)
  __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.
  __m128i ch_mag_des,ch_mag_int;
  __m128i  y0r_over_sqrt10;
  __m128i  y0i_over_sqrt10;
  __m128i  y0r_three_over_sqrt10;
  __m128i  y0i_three_over_sqrt10;
  __m128i ch_mag_over_10;
  __m128i ch_mag_over_2;
  __m128i ch_mag_9_over_10;
  __m128i ch_mag_int_with_sigma2;
  __m128i two_ch_mag_int_with_sigma2;
  __m128i three_ch_mag_int_with_sigma2;

#elif defined(__arm__)
  // No ARM implementation: nothing is declared here and the loop body
  // below is empty on this target.

#endif
  int i;

  // '>>' binds tighter than '<': the condition is i < (length>>2).
  // i indexes __m128i vectors (4 complex REs each) and advances by 2.
  for (i=0; i<length>>2; i+=2) {
    // In one iteration, we deal with 8 REs

#if defined(__x86_64__) || defined(__i386__)
    // Get rho
    // The three shuffles below turn [Re0 Im0 Re1 Im1 Re2 Im2 Re3 Im3] into
    // [Re0 Re1 Re2 Re3 Im0 Im1 Im2 Im3]. 0xd8 == _MM_SHUFFLE(3,1,2,0).
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho), 8 lanes
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho), 8 lanes
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    // Suffix _a_b: Re(rho) weighted by a/sqrt(10), Im(rho) by b/sqrt(10).
    // The 3/sqrt(10) constant is stored *2^15, hence the extra <<1.
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi,ONE_OVER_SQRT_10);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi,ONE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi,THREE_OVER_SQRT_10);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi,THREE_OVER_SQRT_10);
    rho_rpi_3_3 = _mm_slli_epi16(rho_rpi_3_3,1);
    rho_rmi_3_3 = _mm_slli_epi16(rho_rmi_3_3,1);

    xmm4 = _mm_mulhi_epi16(xmm2,ONE_OVER_SQRT_10); // Re(rho)/sqrt(10)
    xmm5 = _mm_mulhi_epi16(xmm3,THREE_OVER_SQRT_10); // Im(rho)
    xmm5 = _mm_slli_epi16(xmm5,1); // 3*Im(rho)/sqrt(10)

    rho_rpi_1_3 = _mm_adds_epi16(xmm4,xmm5);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4,xmm5);

    xmm6 = _mm_mulhi_epi16(xmm2,THREE_OVER_SQRT_10); // Re(rho)
    xmm7 = _mm_mulhi_epi16(xmm3,ONE_OVER_SQRT_10); // Im(rho)/sqrt(10)
    xmm6 = _mm_slli_epi16(xmm6,1); // 3*Re(rho)/sqrt(10)

    rho_rpi_3_1 = _mm_adds_epi16(xmm6,xmm7);
    rho_rmi_3_1 = _mm_subs_epi16(xmm6,xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),...,y1r(8)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),...,y1i(8)]

    // psi_{r,i}_<a>_<b>: absolute value of a (saturating) combination of
    // the scaled rho terms with y1r / y1i, one pair per candidate
    // (a, b) in {p1,p3,m1,m3}^2. xmm2 is the scratch register throughout.
    // NOTE(review): the zero written to xmm0 here is not read before xmm0
    // is reloaded under "Rearrange desired MF output" below.
    xmm0 = _mm_setzero_si128(); // ZERO
    xmm2 = _mm_subs_epi16(rho_rpi_1_1,y1r); // = [Re(rho)+ Im(rho)]/sqrt(10) - y1r
    psi_r_p1_p1 = _mm_abs_epi16(xmm2); // = |[Re(rho)+ Im(rho)]/sqrt(10) - y1r|

    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_1,y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_1,y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_1_3,y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rmi_3_3,y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_1,y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_1,y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_1_3,y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_subs_epi16(rho_rpi_3_3,y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_1,y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_1_3,y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_1,y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(rho_rpi_3_3,y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_1);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_1_3);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_1);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_1);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_1);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1r,rho_rmi_3_3);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_1_3);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2= _mm_adds_epi16(y1i,rho_rmi_3_3);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);

    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); // _MM_SHUFFLE(3,1,2,0)
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),...,y0r(8)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1); // = [y0i(1),...,y0i(8)]

    // Rearrange desired channel magnitudes
    // Every magnitude is stored twice (Re slot == Im slot), so after the
    // de-interleave only the low 64 bits of each vector are needed.
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2),...]*(2/sqrt(10)), REs 1..4
    xmm3 = ch_mag_128i[i+1]; // same layout, REs 5..8
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)

    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3); // = [|h|^2(1),...,|h|^2(8)]*(2/sqrt(10))

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];

    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); // _MM_SHUFFLE(3,1,2,0)

    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);

    // Scale MF output of desired signal
    // y0/sqrt(10) needs no shift (constant is *2^16); 3*y0/sqrt(10) needs
    // <<1 (constant is *2^15).
    y0r_over_sqrt10 = _mm_mulhi_epi16(y0r,ONE_OVER_SQRT_10);
    y0i_over_sqrt10 = _mm_mulhi_epi16(y0i,ONE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_mulhi_epi16(y0r,THREE_OVER_SQRT_10);
    y0i_three_over_sqrt10 = _mm_mulhi_epi16(y0i,THREE_OVER_SQRT_10);
    y0r_three_over_sqrt10 = _mm_slli_epi16(y0r_three_over_sqrt10,1);
    y0i_three_over_sqrt10 = _mm_slli_epi16(y0i_three_over_sqrt10,1);

    // Compute necessary combination of required terms
    // y0_p_a_b = (a*y0r + b*y0i)/sqrt(10), y0_m_a_b = (a*y0r - b*y0i)/sqrt(10)
    y0_p_1_1 = _mm_adds_epi16(y0r_over_sqrt10,y0i_over_sqrt10);
    y0_m_1_1 = _mm_subs_epi16(y0r_over_sqrt10,y0i_over_sqrt10);

    y0_p_1_3 = _mm_adds_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_1_3 = _mm_subs_epi16(y0r_over_sqrt10,y0i_three_over_sqrt10);

    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_over_sqrt10);

    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt10,y0i_three_over_sqrt10);

    // Compute optimal interfering symbol magnitude
    // Three values derived from ch_mag_int are handed to
    // interference_abs_64qam_epi16 (defined elsewhere), which writes the
    // selected amplitude into its fifth argument.
    // NOTE(review): the original labels on these lines were "*2", "*4",
    // "*6" -- presumably multiples of a common threshold unit; confirm.
    ch_mag_int_with_sigma2       = _mm_srai_epi16(ch_mag_int, 1); // ch_mag_int/2 (arithmetic shift)
    two_ch_mag_int_with_sigma2   = ch_mag_int; // ch_mag_int
    three_ch_mag_int_with_sigma2 = _mm_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // 1.5*ch_mag_int (saturating)

    interference_abs_64qam_epi16(psi_r_p1_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m1 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m1,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m3 ,ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m3,ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42,FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    // Calculation of groups of two terms in the bit metric involving product of psi and interference magnitude
    // (prodsum_psi_a_epi16 is defined elsewhere; its last argument receives the result)
    prodsum_psi_a_epi16(psi_r_p1_p1,a_r_p1_p1,psi_i_p1_p1,a_i_p1_p1,psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_p3,a_r_p1_p3,psi_i_p1_p3,a_i_p1_p3,psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1,a_r_p3_p1,psi_i_p3_p1,a_i_p3_p1,psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_p3,a_r_p3_p3,psi_i_p3_p3,a_i_p3_p3,psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p1_m1,a_r_p1_m1,psi_i_p1_m1,a_i_p1_m1,psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3,a_r_p1_m3,psi_i_p1_m3,a_i_p1_m3,psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p3_m1,a_r_p3_m1,psi_i_p3_m1,a_i_p3_m1,psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3,a_r_p3_m3,psi_i_p3_m3,a_i_p3_m3,psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_m1_p1,a_r_m1_p1,psi_i_m1_p1,a_i_m1_p1,psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_p3,a_r_m1_p3,psi_i_m1_p3,a_i_m1_p3,psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1,a_r_m3_p1,psi_i_m3_p1,a_i_m3_p1,psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_p3,a_r_m3_p3,psi_i_m3_p3,a_i_m3_p3,psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m1_m1,a_r_m1_m1,psi_i_m1_m1,a_i_m1_m1,psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3,a_r_m1_m3,psi_i_m1_m3,a_i_m1_m3,psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m3_m1,a_r_m3_m1,psi_i_m3_m1,a_i_m3_m1,psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3,a_r_m3_m3,psi_i_m3_m3,a_i_m3_m3,psi_a_m3_m3);

    // Multiply by sqrt(2)
    // mulhi by round(2^15/sqrt(2)) gives x/(2*sqrt(2)); the <<2 that
    // follows turns it into x*sqrt(2).
    psi_a_p1_p1 = _mm_mulhi_epi16(psi_a_p1_p1, ONE_OVER_SQRT_2);
    psi_a_p1_p1 = _mm_slli_epi16(psi_a_p1_p1, 2);
    psi_a_p1_p3 = _mm_mulhi_epi16(psi_a_p1_p3, ONE_OVER_SQRT_2);
    psi_a_p1_p3 = _mm_slli_epi16(psi_a_p1_p3, 2);
    psi_a_p3_p1 = _mm_mulhi_epi16(psi_a_p3_p1, ONE_OVER_SQRT_2);
    psi_a_p3_p1 = _mm_slli_epi16(psi_a_p3_p1, 2);
    psi_a_p3_p3 = _mm_mulhi_epi16(psi_a_p3_p3, ONE_OVER_SQRT_2);
    psi_a_p3_p3 = _mm_slli_epi16(psi_a_p3_p3, 2);
    psi_a_p1_m1 = _mm_mulhi_epi16(psi_a_p1_m1, ONE_OVER_SQRT_2);
    psi_a_p1_m1 = _mm_slli_epi16(psi_a_p1_m1, 2);
    psi_a_p1_m3 = _mm_mulhi_epi16(psi_a_p1_m3, ONE_OVER_SQRT_2);
    psi_a_p1_m3 = _mm_slli_epi16(psi_a_p1_m3, 2);
    psi_a_p3_m1 = _mm_mulhi_epi16(psi_a_p3_m1, ONE_OVER_SQRT_2);
    psi_a_p3_m1 = _mm_slli_epi16(psi_a_p3_m1, 2);
    psi_a_p3_m3 = _mm_mulhi_epi16(psi_a_p3_m3, ONE_OVER_SQRT_2);
    psi_a_p3_m3 = _mm_slli_epi16(psi_a_p3_m3, 2);
    psi_a_m1_p1 = _mm_mulhi_epi16(psi_a_m1_p1, ONE_OVER_SQRT_2);
    psi_a_m1_p1 = _mm_slli_epi16(psi_a_m1_p1, 2);
    psi_a_m1_p3 = _mm_mulhi_epi16(psi_a_m1_p3, ONE_OVER_SQRT_2);
    psi_a_m1_p3 = _mm_slli_epi16(psi_a_m1_p3, 2);
    psi_a_m3_p1 = _mm_mulhi_epi16(psi_a_m3_p1, ONE_OVER_SQRT_2);
    psi_a_m3_p1 = _mm_slli_epi16(psi_a_m3_p1, 2);
    psi_a_m3_p3 = _mm_mulhi_epi16(psi_a_m3_p3, ONE_OVER_SQRT_2);
    psi_a_m3_p3 = _mm_slli_epi16(psi_a_m3_p3, 2);
    psi_a_m1_m1 = _mm_mulhi_epi16(psi_a_m1_m1, ONE_OVER_SQRT_2);
    psi_a_m1_m1 = _mm_slli_epi16(psi_a_m1_m1, 2);
    psi_a_m1_m3 = _mm_mulhi_epi16(psi_a_m1_m3, ONE_OVER_SQRT_2);
    psi_a_m1_m3 = _mm_slli_epi16(psi_a_m1_m3, 2);
    psi_a_m3_m1 = _mm_mulhi_epi16(psi_a_m3_m1, ONE_OVER_SQRT_2);
    psi_a_m3_m1 = _mm_slli_epi16(psi_a_m3_m1, 2);
    psi_a_m3_m3 = _mm_mulhi_epi16(psi_a_m3_m3, ONE_OVER_SQRT_2);
    psi_a_m3_m3 = _mm_slli_epi16(psi_a_m3_m3, 2);

    // squared interference magnitude times int. ch. power
    // (square_a_64qam_epi16 is defined elsewhere; its last argument receives the result)
    square_a_64qam_epi16(a_r_p1_p1,a_i_p1_p1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p1_p1);
    square_a_64qam_epi16(a_r_p1_p3,a_i_p1_p3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p1_p3);
    square_a_64qam_epi16(a_r_p3_p1,a_i_p3_p1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p3_p1);
    square_a_64qam_epi16(a_r_p3_p3,a_i_p3_p3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p3_p3);
    square_a_64qam_epi16(a_r_p1_m1,a_i_p1_m1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p1_m1);
    square_a_64qam_epi16(a_r_p1_m3,a_i_p1_m3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p1_m3);
    square_a_64qam_epi16(a_r_p3_m1,a_i_p3_m1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p3_m1);
    square_a_64qam_epi16(a_r_p3_m3,a_i_p3_m3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_p3_m3);
    square_a_64qam_epi16(a_r_m1_p1,a_i_m1_p1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m1_p1);
    square_a_64qam_epi16(a_r_m1_p3,a_i_m1_p3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m1_p3);
    square_a_64qam_epi16(a_r_m3_p1,a_i_m3_p1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m3_p1);
    square_a_64qam_epi16(a_r_m3_p3,a_i_m3_p3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m3_p3);
    square_a_64qam_epi16(a_r_m1_m1,a_i_m1_m1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m1_m1);
    square_a_64qam_epi16(a_r_m1_m3,a_i_m1_m3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m1_m3);
    square_a_64qam_epi16(a_r_m3_m1,a_i_m3_m1,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m3_m1);
    square_a_64qam_epi16(a_r_m3_m3,a_i_m3_m3,ch_mag_int,SQRT_42_OVER_FOUR,a_sq_m3_m3);

    // Computing different multiples of channel norms
    // With ch_mag_des = |h|^2*(2/sqrt(10)):
    //   * 1/(2*sqrt(10))          -> |h|^2 * 1/10
    //   * sqrt(10)/4  (<<1)       -> |h|^2 * 1/2
    //   * 9/(2*sqrt(10))  (<<2)   -> |h|^2 * 9/10
    ch_mag_over_10=_mm_mulhi_epi16(ch_mag_des, ONE_OVER_TWO_SQRT_10);
    ch_mag_over_2=_mm_mulhi_epi16(ch_mag_des, SQRT_10_OVER_FOUR);
    ch_mag_over_2=_mm_slli_epi16(ch_mag_over_2, 1);
    ch_mag_9_over_10=_mm_mulhi_epi16(ch_mag_des, NINE_OVER_TWO_SQRT_10);
    ch_mag_9_over_10=_mm_slli_epi16(ch_mag_9_over_10, 2);

    // Computing Metrics
    // bit_met_<re>_<im> = psi_a - a_sq +/- y0 term - channel-norm term, all
    // with saturating arithmetic. Candidates with a positive real part add
    // the y0 term, candidates with a negative real part subtract it. The
    // norm term is 1/10, 1/2 or 9/10 of |h|^2 for (1,1), (1,3)/(3,1) and
    // (3,3) amplitudes respectively.
    xmm0 = _mm_subs_epi16(psi_a_p1_p1,a_sq_p1_p1);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_1_1);
    bit_met_p1_p1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p1_p3,a_sq_p1_p3);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_1_3);
    bit_met_p1_p3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p1_m1,a_sq_p1_m1);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_1_1);
    bit_met_p1_m1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p1_m3,a_sq_p1_m3);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_1_3);
    bit_met_p1_m3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_p1,a_sq_p3_p1);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_3_1);
    bit_met_p3_p1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_p3,a_sq_p3_p3);
    xmm1 = _mm_adds_epi16(xmm0,y0_p_3_3);
    bit_met_p3_p3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_p3_m1,a_sq_p3_m1);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_3_1);
    bit_met_p3_m1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_p3_m3,a_sq_p3_m3);
    xmm1 = _mm_adds_epi16(xmm0,y0_m_3_3);
    bit_met_p3_m3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_p1,a_sq_m1_p1);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_1_1);
    bit_met_m1_p1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_p3,a_sq_m1_p3);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_1_3);
    bit_met_m1_p3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m1_m1,a_sq_m1_m1);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_1_1);
    bit_met_m1_m1= _mm_subs_epi16(xmm1,ch_mag_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m1_m3,a_sq_m1_m3);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_1_3);
    bit_met_m1_m3= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_p1,a_sq_m3_p1);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_3_1);
    bit_met_m3_p1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_p3,a_sq_m3_p3);
    xmm1 = _mm_subs_epi16(xmm0,y0_m_3_3);
    bit_met_m3_p3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    xmm0 = _mm_subs_epi16(psi_a_m3_m1,a_sq_m3_m1);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_3_1);
    bit_met_m3_m1= _mm_subs_epi16(xmm1,ch_mag_over_2);

    xmm0 = _mm_subs_epi16(psi_a_m3_m3,a_sq_m3_m3);
    xmm1 = _mm_subs_epi16(xmm0,y0_p_3_3);
    bit_met_m3_m3= _mm_subs_epi16(xmm1,ch_mag_9_over_10);

    // LLR of the first bit
    // Bit = 1: maximum over the 8 candidates with negative real part
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re0= _mm_max_epi16(xmm4,xmm5);

    // Bit = 0: maximum over the 8 candidates with positive real part
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm1 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of first bit [L1(1), L1(2), L1(3), L1(4), L1(5), L1(6), L1(7), L1(8)]
    // (y0r is reused from here on to hold LLRs instead of the MF output)
    y0r = _mm_subs_epi16(logmax_den_re0,logmax_num_re0);

    // LLR of the second bit
    // Bit = 1: maximum over the 8 candidates with negative imaginary part
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_re1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0: maximum over the 8 candidates with positive imaginary part
    xmm0 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm1 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_re1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of second bit [L2(1), ..., L2(8)]
    y1r = _mm_subs_epi16(logmax_den_re1,logmax_num_re1);

    // LLR of the third bit
    // Bit = 1: maximum over the 8 candidates with real part +-3
    xmm0 = _mm_max_epi16(bit_met_m3_p1,bit_met_m3_p3);
    xmm1 = _mm_max_epi16(bit_met_m3_m1,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p3_p1,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_m1,bit_met_p3_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im0 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0: maximum over the 8 candidates with real part +-1
    xmm0 = _mm_max_epi16(bit_met_m1_p1,bit_met_m1_p3);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m1_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_m1,bit_met_p1_m3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im0 = _mm_max_epi16(xmm4,xmm5);

    // LLR of third bit [L3(1), ..., L3(8)]
    y0i = _mm_subs_epi16(logmax_den_im0,logmax_num_im0);

    // LLR of the fourth bit
    // Bit = 1: maximum over the 8 candidates with imaginary part +-3
    xmm0 = _mm_max_epi16(bit_met_p1_m3,bit_met_p3_m3);
    xmm1 = _mm_max_epi16(bit_met_m1_m3,bit_met_m3_m3);
    xmm2 = _mm_max_epi16(bit_met_p1_p3,bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p3,bit_met_m3_p3);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_num_im1 = _mm_max_epi16(xmm4,xmm5);

    // Bit = 0: maximum over the 8 candidates with imaginary part +-1
    xmm0 = _mm_max_epi16(bit_met_p1_m1,bit_met_p3_m1);
    xmm1 = _mm_max_epi16(bit_met_m1_m1,bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1,bit_met_p3_p1);
    xmm3 = _mm_max_epi16(bit_met_m1_p1,bit_met_m3_p1);
    xmm4 = _mm_max_epi16(xmm0,xmm1);
    xmm5 = _mm_max_epi16(xmm2,xmm3);
    logmax_den_im1 = _mm_max_epi16(xmm4,xmm5);

    // LLR of fourth bit [L4(1), ..., L4(8)]
    y1i = _mm_subs_epi16(logmax_den_im1,logmax_num_im1);

    // Pack LLRs in output
    // [L1(1), L2(1), L1(2), L2(2), L1(3), L2(3), L1(4), L2(4)]
    xmm0 = _mm_unpacklo_epi16(y0r,y1r);
    // [L1(5), L2(5), L1(6), L2(6), L1(7), L2(7), L1(8), L2(8)]
    xmm1 = _mm_unpackhi_epi16(y0r,y1r);
    // [L3(1), L4(1), L3(2), L4(2), L3(3), L4(3), L3(4), L4(4)]
    xmm2 = _mm_unpacklo_epi16(y0i,y1i);
    // [L3(5), L4(5), L3(6), L4(6), L3(7), L4(7), L3(8), L4(8)]
    xmm3 = _mm_unpackhi_epi16(y0i,y1i);

    // Interleave to [L1 L2 L3 L4] per RE. Four output vectors per
    // iteration (i advances by 2, so the base index 2*i advances by 4).
    stream0_128i_out[2*i+0] = _mm_unpacklo_epi32(xmm0,xmm2); // 8LLRs, 2REs (REs 1,2)
    stream0_128i_out[2*i+1] = _mm_unpackhi_epi32(xmm0,xmm2); // REs 3,4
    stream0_128i_out[2*i+2] = _mm_unpacklo_epi32(xmm1,xmm3); // REs 5,6
    stream0_128i_out[2*i+3] = _mm_unpackhi_epi32(xmm1,xmm3); // REs 7,8
#elif defined(__arm__)
    // No ARM implementation: stream0_out is left untouched on this target.

#endif

  }

#if defined(__x86_64__) || defined(__i386__)
  // Both calls request the MMX "empty state" operation.
  // NOTE(review): no MMX intrinsics appear in this function, so these look
  // like leftovers -- confirm before removing.
  _mm_empty();
  _m_empty();
#endif
}

/*!
 * \brief Per-symbol wrapper that feeds one OFDM symbol to nr_qam16_qam64().
 *
 * Takes index 0 of every input buffer, moves to the first resource element of
 * \p symbol (offset symbol * N_RB_DL * 12), derives the number of resource
 * elements to process and calls nr_qam16_qam64().  The running LLR write
 * pointer is then advanced by four int16_t entries per resource element and
 * stored back through \p llr16p.
 *
 * \param frame_parms          frame parameters; N_RB_DL, Ncp and nb_antenna_ports_gNB are read
 * \param rxdataF_comp         first-stream input buffers (only index 0 is used)
 * \param rxdataF_comp_i       second-stream input buffers (only index 0 is used)
 * \param dl_ch_mag            |h_0|^2*(2/sqrt{10}) (only index 0 is used)
 * \param dl_ch_mag_i          |h_1|^2*(2/sqrt{10}) (only index 0 is used)
 * \param rho_i                cross-correlation buffers (only index 0 is used)
 * \param dlsch_llr            start of the LLR output buffer, used when first_symbol_flag is 1
 * \param symbol               symbol index
 * \param first_symbol_flag    1 to start writing at \p dlsch_llr, otherwise continue at *llr16p
 * \param nb_rb                number of resource blocks
 * \param pbch_pss_sss_adjust  number of resource elements to subtract from the per-symbol count
 * \param llr16p               in/out: running LLR write pointer, updated on return
 * \return always 0
 */
int nr_dlsch_16qam_64qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                             int32_t **rxdataF_comp,
                             int32_t **rxdataF_comp_i,
                             int32_t **dl_ch_mag,   // |h_0|^2*(2/sqrt{10})
                             int32_t **dl_ch_mag_i, // |h_1|^2*(2/sqrt{10})
                             int32_t **rho_i,
                             int16_t *dlsch_llr,
                             uint8_t symbol,
                             uint8_t first_symbol_flag,
                             uint16_t nb_rb,
                             uint16_t pbch_pss_sss_adjust,
                             int16_t **llr16p)
{
  // Offset of this symbol's first resource element inside each buffer
  const int re_offset = symbol*frame_parms->N_RB_DL*12;

  int16_t *y0      = (int16_t*)(rxdataF_comp[0] + re_offset);
  int16_t *y1      = (int16_t*)(rxdataF_comp_i[0] + re_offset);
  int16_t *mag0    = (int16_t*)(dl_ch_mag[0] + re_offset);
  int16_t *mag1    = (int16_t*)(dl_ch_mag_i[0] + re_offset);
  int16_t *rho_sym = (int16_t*)(rho_i[0] + re_offset);

  // Fold the symbol index back once it reaches (7 - Ncp)
  uint8_t symbol_mod = symbol;

  if (symbol >= (7-frame_parms->Ncp))
    symbol_mod = symbol-(7-frame_parms->Ncp);

  // first symbol has different structure due to more pilots:
  // it restarts at the head of the LLR buffer, later symbols append
  int16_t *llr_out = (first_symbol_flag == 1) ? dlsch_llr : *llr16p;

  AssertFatal(llr_out!=NULL,"nr_dlsch_16qam_64qam_llr:llr is null, symbol %d\n",symbol);

  int n_re;

  if ((symbol_mod!=0) && (symbol_mod!=(4-frame_parms->Ncp))) {
    // symbol has no pilots: 12 REs per RB
    n_re = (nb_rb*12) - pbch_pss_sss_adjust;
  } else if (frame_parms->nb_antenna_ports_gNB==1) {
    // pilot symbol, 1 antenna port: 10 REs per symbol per RB
    n_re = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
  } else {
    // pilot symbol, 2 antenna ports: 8 REs per symbol per RB
    n_re = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
  }

  // printf("symbol %d: qam16_llr, len %d (llr16 %p)\n",symbol,n_re,llr_out);

  nr_qam16_qam64((short *)y0,
                 (short *)y1,
                 (short *)mag0,
                 (short *)mag1,
                 (short *)llr_out,
                 (short *)rho_sym,
                 n_re);

  // Four LLR entries are consumed per resource element
  *llr16p = (short *)(llr_out + (n_re<<2));

  return(0);
}

//----------------------------------------------------------------------------------------------
// 64-QAM
//----------------------------------------------------------------------------------------------

/*
__m128i ONE_OVER_SQRT_42 __attribute__((aligned(16)));
__m128i THREE_OVER_SQRT_42 __attribute__((aligned(16)));
__m128i FIVE_OVER_SQRT_42 __attribute__((aligned(16)));
__m128i SEVEN_OVER_SQRT_42 __attribute__((aligned(16)));

__m128i FORTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i TWENTYNINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i TWENTYFIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i SEVENTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i NINE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i THIRTEEN_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i FIVE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));
__m128i ONE_OVER_FOUR_SQRT_42 __attribute__((aligned(16)));

__m128i  y0r_one_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0r_three_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0r_five_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0r_seven_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0i_one_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0i_three_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0i_five_over_sqrt_21 __attribute__((aligned(16)));
__m128i  y0i_seven_over_sqrt_21 __attribute__((aligned(16)));

__m128i ch_mag_98_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_74_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_58_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_50_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_34_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_18_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_26_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_10_over_42_with_sigma2 __attribute__((aligned(16)));
__m128i ch_mag_2_over_42_with_sigma2 __attribute__((aligned(16)));

*/

void nr_qam64_qpsk(int16_t *stream0_in,
                int16_t *stream1_in,
                int16_t *ch_mag,
                int16_t *stream0_out,
                int16_t *rho01,
                int32_t length
    )
{

  /*
    Author: S. Wagner
    Date: 31-07-12

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc (note: not an argument of this function)
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)
  __m128i *rho01_128i      = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *ch_mag_128i     = (__m128i *)ch_mag;


  __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16)
  __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16)
  __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15)
  __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14), Q2.14
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14
  __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14
  __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15)
  __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^14), Q2.14
  __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15)
  __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15)
  __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15)
  __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15)
  __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15)


  __m128i ch_mag_des;
  __m128i ch_mag_98_over_42_with_sigma2;
  __m128i ch_mag_74_over_42_with_sigma2;
  __m128i ch_mag_58_over_42_with_sigma2;
  __m128i ch_mag_50_over_42_with_sigma2;
  __m128i ch_mag_34_over_42_with_sigma2;
  __m128i ch_mag_18_over_42_with_sigma2;
  __m128i ch_mag_26_over_42_with_sigma2;
  __m128i ch_mag_10_over_42_with_sigma2;
  __m128i ch_mag_2_over_42_with_sigma2;
  __m128i  y0r_one_over_sqrt_21;
  __m128i  y0r_three_over_sqrt_21;
  __m128i  y0r_five_over_sqrt_21;
  __m128i  y0r_seven_over_sqrt_21;
  __m128i  y0i_one_over_sqrt_21;
  __m128i  y0i_three_over_sqrt_21;
  __m128i  y0i_five_over_sqrt_21;
  __m128i  y0i_seven_over_sqrt_21;
#elif defined(__arm__)

#endif

  int i,j;

  for (i=0; i<length>>2; i+=2) {

#if defined(__x86_64) || defined(__i386__)
    // Get rho
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
    rho_rpi_5_5 = _mm_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
    rho_rmi_5_5 = _mm_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
    rho_rpi_7_7 = _mm_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
    rho_rmi_7_7 = _mm_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);

    rho_rpi_5_5 = _mm_slli_epi16(rho_rpi_5_5, 1);
    rho_rmi_5_5 = _mm_slli_epi16(rho_rmi_5_5, 1);
    rho_rpi_7_7 = _mm_slli_epi16(rho_rpi_7_7, 2);
    rho_rmi_7_7 = _mm_slli_epi16(rho_rmi_7_7, 2);

    xmm4 = _mm_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
    xmm5 = _mm_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
    xmm6 = _mm_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
    xmm7 = _mm_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
    xmm8 = _mm_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
    xmm7 = _mm_slli_epi16(xmm7, 1);
    xmm8 = _mm_slli_epi16(xmm8, 2);

    rho_rpi_1_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_1_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_1_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_1_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_1_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
    rho_rpi_3_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_3_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_3_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_3_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_3_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_3_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 1);
    rho_rpi_5_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_5_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_5_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_5_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_5_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_5_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 2);
    rho_rpi_7_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_7_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_7_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_7_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_7_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_7_5 = _mm_subs_epi16(xmm4, xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    // Psi_r calculation from rho_rpi or rho_rmi
    xmm0 = _mm_setzero_si128(); // ZERO for abs_pi16
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1r);
    psi_r_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1r);
    psi_r_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1r);
    psi_r_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1r);
    psi_r_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1r);
    psi_r_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1r);
    psi_r_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1r);
    psi_r_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1r);
    psi_r_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1r);
    psi_r_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1r);
    psi_r_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1r);
    psi_r_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1r);
    psi_r_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1r);
    psi_r_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1r);
    psi_r_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1r);
    psi_r_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1r);
    psi_r_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1r);
    psi_r_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1r);
    psi_r_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1r);
    psi_r_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1r);
    psi_r_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1r);
    psi_r_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1r);
    psi_r_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1r);
    psi_r_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1r);
    psi_r_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1r);
    psi_r_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1r);
    psi_r_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1r);
    psi_r_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1r);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1r);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1r);
    psi_r_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1r);
    psi_r_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1r);
    psi_r_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1r);
    psi_r_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1r);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1r);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1r);
    psi_r_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1r);
    psi_r_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1r);
    psi_r_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1r);
    psi_r_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1r);
    psi_r_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1r);
    psi_r_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1r);
    psi_r_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1r);
    psi_r_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1r);
    psi_r_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1r);
    psi_r_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1r);
    psi_r_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1r);
    psi_r_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1r);
    psi_r_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1r);
    psi_r_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1r);
    psi_r_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1r);
    psi_r_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1r);
    psi_r_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1r);
    psi_r_m7_m7 = _mm_abs_epi16(xmm2);

    // Psi_i calculation from rho_rpi or rho_rmi
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1i);
    psi_i_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1i);
    psi_i_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1i);
    psi_i_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1i);
    psi_i_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1i);
    psi_i_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1i);
    psi_i_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1i);
    psi_i_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1i);
    psi_i_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1i);
    psi_i_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1i);
    psi_i_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1i);
    psi_i_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1i);
    psi_i_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1i);
    psi_i_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1i);
    psi_i_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1i);
    psi_i_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1i);
    psi_i_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1i);
    psi_i_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1i);
    psi_i_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1i);
    psi_i_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1i);
    psi_i_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1i);
    psi_i_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1i);
    psi_i_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1i);
    psi_i_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1i);
    psi_i_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1i);
    psi_i_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1i);
    psi_i_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1i);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1i);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1i);
    psi_i_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1i);
    psi_i_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1i);
    psi_i_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1i);
    psi_i_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1i);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1i);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1i);
    psi_i_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1i);
    psi_i_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1i);
    psi_i_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1i);
    psi_i_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1i);
    psi_i_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1i);
    psi_i_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1i);
    psi_i_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1i);
    psi_i_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1i);
    psi_i_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1i);
    psi_i_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1i);
    psi_i_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1i);
    psi_i_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1i);
    psi_i_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1i);
    psi_i_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1i);
    psi_i_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1i);
    psi_i_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1i);
    psi_i_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1i);
    psi_i_m7_m7 = _mm_abs_epi16(xmm2);


    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3);

    y0r_one_over_sqrt_21   = _mm_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
    y0r_three_over_sqrt_21 = _mm_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_slli_epi16(y0r_five_over_sqrt_21, 1);
    y0r_seven_over_sqrt_21 = _mm_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
    y0r_seven_over_sqrt_21 = _mm_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14

    y0i_one_over_sqrt_21   = _mm_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
    y0i_three_over_sqrt_21 = _mm_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_slli_epi16(y0i_five_over_sqrt_21, 1);
    y0i_seven_over_sqrt_21 = _mm_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
    y0i_seven_over_sqrt_21 = _mm_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14

    y0_p_7_1 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_7_3 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_7_5 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_7_7 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_5_1 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_5_3 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_5_5 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_5_7 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_3_5 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_3_7 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_1_1 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_1_3 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_1_5 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_1_7 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);

    y0_m_1_1 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_1_3 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_1_5 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_1_7 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_3_5 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_3_7 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_5_1 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_5_3 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_5_5 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_5_7 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_7_1 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_7_3 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_7_5 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_7_7 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);

    // divide by sqrt(2)
    psi_r_p7_p7 = _mm_mulhi_epi16(psi_r_p7_p7, ONE_OVER_SQRT_2);
    psi_r_p7_p7 = _mm_slli_epi16(psi_r_p7_p7, 1);
    psi_r_p7_p5 = _mm_mulhi_epi16(psi_r_p7_p5, ONE_OVER_SQRT_2);
    psi_r_p7_p5 = _mm_slli_epi16(psi_r_p7_p5, 1);
    psi_r_p7_p3 = _mm_mulhi_epi16(psi_r_p7_p3, ONE_OVER_SQRT_2);
    psi_r_p7_p3 = _mm_slli_epi16(psi_r_p7_p3, 1);
    psi_r_p7_p1 = _mm_mulhi_epi16(psi_r_p7_p1, ONE_OVER_SQRT_2);
    psi_r_p7_p1 = _mm_slli_epi16(psi_r_p7_p1, 1);
    psi_r_p7_m1 = _mm_mulhi_epi16(psi_r_p7_m1, ONE_OVER_SQRT_2);
    psi_r_p7_m1 = _mm_slli_epi16(psi_r_p7_m1, 1);
    psi_r_p7_m3 = _mm_mulhi_epi16(psi_r_p7_m3, ONE_OVER_SQRT_2);
    psi_r_p7_m3 = _mm_slli_epi16(psi_r_p7_m3, 1);
    psi_r_p7_m5 = _mm_mulhi_epi16(psi_r_p7_m5, ONE_OVER_SQRT_2);
    psi_r_p7_m5 = _mm_slli_epi16(psi_r_p7_m5, 1);
    psi_r_p7_m7 = _mm_mulhi_epi16(psi_r_p7_m7, ONE_OVER_SQRT_2);
    psi_r_p7_m7 = _mm_slli_epi16(psi_r_p7_m7, 1);
    psi_r_p5_p7 = _mm_mulhi_epi16(psi_r_p5_p7, ONE_OVER_SQRT_2);
    psi_r_p5_p7 = _mm_slli_epi16(psi_r_p5_p7, 1);
    psi_r_p5_p5 = _mm_mulhi_epi16(psi_r_p5_p5, ONE_OVER_SQRT_2);
    psi_r_p5_p5 = _mm_slli_epi16(psi_r_p5_p5, 1);
    psi_r_p5_p3 = _mm_mulhi_epi16(psi_r_p5_p3, ONE_OVER_SQRT_2);
    psi_r_p5_p3 = _mm_slli_epi16(psi_r_p5_p3, 1);
    psi_r_p5_p1 = _mm_mulhi_epi16(psi_r_p5_p1, ONE_OVER_SQRT_2);
    psi_r_p5_p1 = _mm_slli_epi16(psi_r_p5_p1, 1);
    psi_r_p5_m1 = _mm_mulhi_epi16(psi_r_p5_m1, ONE_OVER_SQRT_2);
    psi_r_p5_m1 = _mm_slli_epi16(psi_r_p5_m1, 1);
    psi_r_p5_m3 = _mm_mulhi_epi16(psi_r_p5_m3, ONE_OVER_SQRT_2);
    psi_r_p5_m3 = _mm_slli_epi16(psi_r_p5_m3, 1);
    psi_r_p5_m5 = _mm_mulhi_epi16(psi_r_p5_m5, ONE_OVER_SQRT_2);
    psi_r_p5_m5 = _mm_slli_epi16(psi_r_p5_m5, 1);
    psi_r_p5_m7 = _mm_mulhi_epi16(psi_r_p5_m7, ONE_OVER_SQRT_2);
    psi_r_p5_m7 = _mm_slli_epi16(psi_r_p5_m7, 1);
    psi_r_p3_p7 = _mm_mulhi_epi16(psi_r_p3_p7, ONE_OVER_SQRT_2);
    psi_r_p3_p7 = _mm_slli_epi16(psi_r_p3_p7, 1);
    psi_r_p3_p5 = _mm_mulhi_epi16(psi_r_p3_p5, ONE_OVER_SQRT_2);
    psi_r_p3_p5 = _mm_slli_epi16(psi_r_p3_p5, 1);
    psi_r_p3_p3 = _mm_mulhi_epi16(psi_r_p3_p3, ONE_OVER_SQRT_2);
    psi_r_p3_p3 = _mm_slli_epi16(psi_r_p3_p3, 1);
    psi_r_p3_p1 = _mm_mulhi_epi16(psi_r_p3_p1, ONE_OVER_SQRT_2);
    psi_r_p3_p1 = _mm_slli_epi16(psi_r_p3_p1, 1);
    psi_r_p3_m1 = _mm_mulhi_epi16(psi_r_p3_m1, ONE_OVER_SQRT_2);
    psi_r_p3_m1 = _mm_slli_epi16(psi_r_p3_m1, 1);
    psi_r_p3_m3 = _mm_mulhi_epi16(psi_r_p3_m3, ONE_OVER_SQRT_2);
    psi_r_p3_m3 = _mm_slli_epi16(psi_r_p3_m3, 1);
    psi_r_p3_m5 = _mm_mulhi_epi16(psi_r_p3_m5, ONE_OVER_SQRT_2);
    psi_r_p3_m5 = _mm_slli_epi16(psi_r_p3_m5, 1);
    psi_r_p3_m7 = _mm_mulhi_epi16(psi_r_p3_m7, ONE_OVER_SQRT_2);
    psi_r_p3_m7 = _mm_slli_epi16(psi_r_p3_m7, 1);
    psi_r_p1_p7 = _mm_mulhi_epi16(psi_r_p1_p7, ONE_OVER_SQRT_2);
    psi_r_p1_p7 = _mm_slli_epi16(psi_r_p1_p7, 1);
    psi_r_p1_p5 = _mm_mulhi_epi16(psi_r_p1_p5, ONE_OVER_SQRT_2);
    psi_r_p1_p5 = _mm_slli_epi16(psi_r_p1_p5, 1);
    psi_r_p1_p3 = _mm_mulhi_epi16(psi_r_p1_p3, ONE_OVER_SQRT_2);
    psi_r_p1_p3 = _mm_slli_epi16(psi_r_p1_p3, 1);
    psi_r_p1_p1 = _mm_mulhi_epi16(psi_r_p1_p1, ONE_OVER_SQRT_2);
    psi_r_p1_p1 = _mm_slli_epi16(psi_r_p1_p1, 1);
    psi_r_p1_m1 = _mm_mulhi_epi16(psi_r_p1_m1, ONE_OVER_SQRT_2);
    psi_r_p1_m1 = _mm_slli_epi16(psi_r_p1_m1, 1);
    psi_r_p1_m3 = _mm_mulhi_epi16(psi_r_p1_m3, ONE_OVER_SQRT_2);
    psi_r_p1_m3 = _mm_slli_epi16(psi_r_p1_m3, 1);
    psi_r_p1_m5 = _mm_mulhi_epi16(psi_r_p1_m5, ONE_OVER_SQRT_2);
    psi_r_p1_m5 = _mm_slli_epi16(psi_r_p1_m5, 1);
    psi_r_p1_m7 = _mm_mulhi_epi16(psi_r_p1_m7, ONE_OVER_SQRT_2);
    psi_r_p1_m7 = _mm_slli_epi16(psi_r_p1_m7, 1);
    psi_r_m1_p7 = _mm_mulhi_epi16(psi_r_m1_p7, ONE_OVER_SQRT_2);
    psi_r_m1_p7 = _mm_slli_epi16(psi_r_m1_p7, 1);
    psi_r_m1_p5 = _mm_mulhi_epi16(psi_r_m1_p5, ONE_OVER_SQRT_2);
    psi_r_m1_p5 = _mm_slli_epi16(psi_r_m1_p5, 1);
    psi_r_m1_p3 = _mm_mulhi_epi16(psi_r_m1_p3, ONE_OVER_SQRT_2);
    psi_r_m1_p3 = _mm_slli_epi16(psi_r_m1_p3, 1);
    psi_r_m1_p1 = _mm_mulhi_epi16(psi_r_m1_p1, ONE_OVER_SQRT_2);
    psi_r_m1_p1 = _mm_slli_epi16(psi_r_m1_p1, 1);
    psi_r_m1_m1 = _mm_mulhi_epi16(psi_r_m1_m1, ONE_OVER_SQRT_2);
    psi_r_m1_m1 = _mm_slli_epi16(psi_r_m1_m1, 1);
    psi_r_m1_m3 = _mm_mulhi_epi16(psi_r_m1_m3, ONE_OVER_SQRT_2);
    psi_r_m1_m3 = _mm_slli_epi16(psi_r_m1_m3, 1);
    psi_r_m1_m5 = _mm_mulhi_epi16(psi_r_m1_m5, ONE_OVER_SQRT_2);
    psi_r_m1_m5 = _mm_slli_epi16(psi_r_m1_m5, 1);
    psi_r_m1_m7 = _mm_mulhi_epi16(psi_r_m1_m7, ONE_OVER_SQRT_2);
    psi_r_m1_m7 = _mm_slli_epi16(psi_r_m1_m7, 1);
    psi_r_m3_p7 = _mm_mulhi_epi16(psi_r_m3_p7, ONE_OVER_SQRT_2);
    psi_r_m3_p7 = _mm_slli_epi16(psi_r_m3_p7, 1);
    psi_r_m3_p5 = _mm_mulhi_epi16(psi_r_m3_p5, ONE_OVER_SQRT_2);
    psi_r_m3_p5 = _mm_slli_epi16(psi_r_m3_p5, 1);
    psi_r_m3_p3 = _mm_mulhi_epi16(psi_r_m3_p3, ONE_OVER_SQRT_2);
    psi_r_m3_p3 = _mm_slli_epi16(psi_r_m3_p3, 1);
    psi_r_m3_p1 = _mm_mulhi_epi16(psi_r_m3_p1, ONE_OVER_SQRT_2);
    psi_r_m3_p1 = _mm_slli_epi16(psi_r_m3_p1, 1);
    psi_r_m3_m1 = _mm_mulhi_epi16(psi_r_m3_m1, ONE_OVER_SQRT_2);
    psi_r_m3_m1 = _mm_slli_epi16(psi_r_m3_m1, 1);
    psi_r_m3_m3 = _mm_mulhi_epi16(psi_r_m3_m3, ONE_OVER_SQRT_2);
    psi_r_m3_m3 = _mm_slli_epi16(psi_r_m3_m3, 1);
    psi_r_m3_m5 = _mm_mulhi_epi16(psi_r_m3_m5, ONE_OVER_SQRT_2);
    psi_r_m3_m5 = _mm_slli_epi16(psi_r_m3_m5, 1);
    psi_r_m3_m7 = _mm_mulhi_epi16(psi_r_m3_m7, ONE_OVER_SQRT_2);
    psi_r_m3_m7 = _mm_slli_epi16(psi_r_m3_m7, 1);
    psi_r_m5_p7 = _mm_mulhi_epi16(psi_r_m5_p7, ONE_OVER_SQRT_2);
    psi_r_m5_p7 = _mm_slli_epi16(psi_r_m5_p7, 1);
    psi_r_m5_p5 = _mm_mulhi_epi16(psi_r_m5_p5, ONE_OVER_SQRT_2);
    psi_r_m5_p5 = _mm_slli_epi16(psi_r_m5_p5, 1);
    psi_r_m5_p3 = _mm_mulhi_epi16(psi_r_m5_p3, ONE_OVER_SQRT_2);
    psi_r_m5_p3 = _mm_slli_epi16(psi_r_m5_p3, 1);
    psi_r_m5_p1 = _mm_mulhi_epi16(psi_r_m5_p1, ONE_OVER_SQRT_2);
    psi_r_m5_p1 = _mm_slli_epi16(psi_r_m5_p1, 1);
    psi_r_m5_m1 = _mm_mulhi_epi16(psi_r_m5_m1, ONE_OVER_SQRT_2);
    psi_r_m5_m1 = _mm_slli_epi16(psi_r_m5_m1, 1);
    psi_r_m5_m3 = _mm_mulhi_epi16(psi_r_m5_m3, ONE_OVER_SQRT_2);
    psi_r_m5_m3 = _mm_slli_epi16(psi_r_m5_m3, 1);
    psi_r_m5_m5 = _mm_mulhi_epi16(psi_r_m5_m5, ONE_OVER_SQRT_2);
    psi_r_m5_m5 = _mm_slli_epi16(psi_r_m5_m5, 1);
    psi_r_m5_m7 = _mm_mulhi_epi16(psi_r_m5_m7, ONE_OVER_SQRT_2);
    psi_r_m5_m7 = _mm_slli_epi16(psi_r_m5_m7, 1);
    psi_r_m7_p7 = _mm_mulhi_epi16(psi_r_m7_p7, ONE_OVER_SQRT_2);
    psi_r_m7_p7 = _mm_slli_epi16(psi_r_m7_p7, 1);
    psi_r_m7_p5 = _mm_mulhi_epi16(psi_r_m7_p5, ONE_OVER_SQRT_2);
    psi_r_m7_p5 = _mm_slli_epi16(psi_r_m7_p5, 1);
    psi_r_m7_p3 = _mm_mulhi_epi16(psi_r_m7_p3, ONE_OVER_SQRT_2);
    psi_r_m7_p3 = _mm_slli_epi16(psi_r_m7_p3, 1);
    psi_r_m7_p1 = _mm_mulhi_epi16(psi_r_m7_p1, ONE_OVER_SQRT_2);
    psi_r_m7_p1 = _mm_slli_epi16(psi_r_m7_p1, 1);
    psi_r_m7_m1 = _mm_mulhi_epi16(psi_r_m7_m1, ONE_OVER_SQRT_2);
    psi_r_m7_m1 = _mm_slli_epi16(psi_r_m7_m1, 1);
    psi_r_m7_m3 = _mm_mulhi_epi16(psi_r_m7_m3, ONE_OVER_SQRT_2);
    psi_r_m7_m3 = _mm_slli_epi16(psi_r_m7_m3, 1);
    psi_r_m7_m5 = _mm_mulhi_epi16(psi_r_m7_m5, ONE_OVER_SQRT_2);
    psi_r_m7_m5 = _mm_slli_epi16(psi_r_m7_m5, 1);
    psi_r_m7_m7 = _mm_mulhi_epi16(psi_r_m7_m7, ONE_OVER_SQRT_2);
    psi_r_m7_m7 = _mm_slli_epi16(psi_r_m7_m7, 1);

    psi_i_p7_p7 = _mm_mulhi_epi16(psi_i_p7_p7, ONE_OVER_SQRT_2);
    psi_i_p7_p7 = _mm_slli_epi16(psi_i_p7_p7, 1);
    psi_i_p7_p5 = _mm_mulhi_epi16(psi_i_p7_p5, ONE_OVER_SQRT_2);
    psi_i_p7_p5 = _mm_slli_epi16(psi_i_p7_p5, 1);
    psi_i_p7_p3 = _mm_mulhi_epi16(psi_i_p7_p3, ONE_OVER_SQRT_2);
    psi_i_p7_p3 = _mm_slli_epi16(psi_i_p7_p3, 1);
    psi_i_p7_p1 = _mm_mulhi_epi16(psi_i_p7_p1, ONE_OVER_SQRT_2);
    psi_i_p7_p1 = _mm_slli_epi16(psi_i_p7_p1, 1);
    psi_i_p7_m1 = _mm_mulhi_epi16(psi_i_p7_m1, ONE_OVER_SQRT_2);
    psi_i_p7_m1 = _mm_slli_epi16(psi_i_p7_m1, 1);
    psi_i_p7_m3 = _mm_mulhi_epi16(psi_i_p7_m3, ONE_OVER_SQRT_2);
    psi_i_p7_m3 = _mm_slli_epi16(psi_i_p7_m3, 1);
    psi_i_p7_m5 = _mm_mulhi_epi16(psi_i_p7_m5, ONE_OVER_SQRT_2);
    psi_i_p7_m5 = _mm_slli_epi16(psi_i_p7_m5, 1);
    psi_i_p7_m7 = _mm_mulhi_epi16(psi_i_p7_m7, ONE_OVER_SQRT_2);
    psi_i_p7_m7 = _mm_slli_epi16(psi_i_p7_m7, 1);
    psi_i_p5_p7 = _mm_mulhi_epi16(psi_i_p5_p7, ONE_OVER_SQRT_2);
    psi_i_p5_p7 = _mm_slli_epi16(psi_i_p5_p7, 1);
    psi_i_p5_p5 = _mm_mulhi_epi16(psi_i_p5_p5, ONE_OVER_SQRT_2);
    psi_i_p5_p5 = _mm_slli_epi16(psi_i_p5_p5, 1);
    psi_i_p5_p3 = _mm_mulhi_epi16(psi_i_p5_p3, ONE_OVER_SQRT_2);
    psi_i_p5_p3 = _mm_slli_epi16(psi_i_p5_p3, 1);
    psi_i_p5_p1 = _mm_mulhi_epi16(psi_i_p5_p1, ONE_OVER_SQRT_2);
    psi_i_p5_p1 = _mm_slli_epi16(psi_i_p5_p1, 1);
    psi_i_p5_m1 = _mm_mulhi_epi16(psi_i_p5_m1, ONE_OVER_SQRT_2);
    psi_i_p5_m1 = _mm_slli_epi16(psi_i_p5_m1, 1);
    psi_i_p5_m3 = _mm_mulhi_epi16(psi_i_p5_m3, ONE_OVER_SQRT_2);
    psi_i_p5_m3 = _mm_slli_epi16(psi_i_p5_m3, 1);
    psi_i_p5_m5 = _mm_mulhi_epi16(psi_i_p5_m5, ONE_OVER_SQRT_2);
    psi_i_p5_m5 = _mm_slli_epi16(psi_i_p5_m5, 1);
    psi_i_p5_m7 = _mm_mulhi_epi16(psi_i_p5_m7, ONE_OVER_SQRT_2);
    psi_i_p5_m7 = _mm_slli_epi16(psi_i_p5_m7, 1);
    psi_i_p3_p7 = _mm_mulhi_epi16(psi_i_p3_p7, ONE_OVER_SQRT_2);
    psi_i_p3_p7 = _mm_slli_epi16(psi_i_p3_p7, 1);
    psi_i_p3_p5 = _mm_mulhi_epi16(psi_i_p3_p5, ONE_OVER_SQRT_2);
    psi_i_p3_p5 = _mm_slli_epi16(psi_i_p3_p5, 1);
    psi_i_p3_p3 = _mm_mulhi_epi16(psi_i_p3_p3, ONE_OVER_SQRT_2);
    psi_i_p3_p3 = _mm_slli_epi16(psi_i_p3_p3, 1);
    psi_i_p3_p1 = _mm_mulhi_epi16(psi_i_p3_p1, ONE_OVER_SQRT_2);
    psi_i_p3_p1 = _mm_slli_epi16(psi_i_p3_p1, 1);
    psi_i_p3_m1 = _mm_mulhi_epi16(psi_i_p3_m1, ONE_OVER_SQRT_2);
    psi_i_p3_m1 = _mm_slli_epi16(psi_i_p3_m1, 1);
    psi_i_p3_m3 = _mm_mulhi_epi16(psi_i_p3_m3, ONE_OVER_SQRT_2);
    psi_i_p3_m3 = _mm_slli_epi16(psi_i_p3_m3, 1);
    psi_i_p3_m5 = _mm_mulhi_epi16(psi_i_p3_m5, ONE_OVER_SQRT_2);
    psi_i_p3_m5 = _mm_slli_epi16(psi_i_p3_m5, 1);
    psi_i_p3_m7 = _mm_mulhi_epi16(psi_i_p3_m7, ONE_OVER_SQRT_2);
    psi_i_p3_m7 = _mm_slli_epi16(psi_i_p3_m7, 1);
    psi_i_p1_p7 = _mm_mulhi_epi16(psi_i_p1_p7, ONE_OVER_SQRT_2);
    psi_i_p1_p7 = _mm_slli_epi16(psi_i_p1_p7, 1);
    psi_i_p1_p5 = _mm_mulhi_epi16(psi_i_p1_p5, ONE_OVER_SQRT_2);
    psi_i_p1_p5 = _mm_slli_epi16(psi_i_p1_p5, 1);
    psi_i_p1_p3 = _mm_mulhi_epi16(psi_i_p1_p3, ONE_OVER_SQRT_2);
    psi_i_p1_p3 = _mm_slli_epi16(psi_i_p1_p3, 1);
    psi_i_p1_p1 = _mm_mulhi_epi16(psi_i_p1_p1, ONE_OVER_SQRT_2);
    psi_i_p1_p1 = _mm_slli_epi16(psi_i_p1_p1, 1);
    psi_i_p1_m1 = _mm_mulhi_epi16(psi_i_p1_m1, ONE_OVER_SQRT_2);
    psi_i_p1_m1 = _mm_slli_epi16(psi_i_p1_m1, 1);
    psi_i_p1_m3 = _mm_mulhi_epi16(psi_i_p1_m3, ONE_OVER_SQRT_2);
    psi_i_p1_m3 = _mm_slli_epi16(psi_i_p1_m3, 1);
    psi_i_p1_m5 = _mm_mulhi_epi16(psi_i_p1_m5, ONE_OVER_SQRT_2);
    psi_i_p1_m5 = _mm_slli_epi16(psi_i_p1_m5, 1);
    psi_i_p1_m7 = _mm_mulhi_epi16(psi_i_p1_m7, ONE_OVER_SQRT_2);
    psi_i_p1_m7 = _mm_slli_epi16(psi_i_p1_m7, 1);
    psi_i_m1_p7 = _mm_mulhi_epi16(psi_i_m1_p7, ONE_OVER_SQRT_2);
    psi_i_m1_p7 = _mm_slli_epi16(psi_i_m1_p7, 1);
    psi_i_m1_p5 = _mm_mulhi_epi16(psi_i_m1_p5, ONE_OVER_SQRT_2);
    psi_i_m1_p5 = _mm_slli_epi16(psi_i_m1_p5, 1);
    psi_i_m1_p3 = _mm_mulhi_epi16(psi_i_m1_p3, ONE_OVER_SQRT_2);
    psi_i_m1_p3 = _mm_slli_epi16(psi_i_m1_p3, 1);
    psi_i_m1_p1 = _mm_mulhi_epi16(psi_i_m1_p1, ONE_OVER_SQRT_2);
    psi_i_m1_p1 = _mm_slli_epi16(psi_i_m1_p1, 1);
    psi_i_m1_m1 = _mm_mulhi_epi16(psi_i_m1_m1, ONE_OVER_SQRT_2);
    psi_i_m1_m1 = _mm_slli_epi16(psi_i_m1_m1, 1);
    psi_i_m1_m3 = _mm_mulhi_epi16(psi_i_m1_m3, ONE_OVER_SQRT_2);
    psi_i_m1_m3 = _mm_slli_epi16(psi_i_m1_m3, 1);
    psi_i_m1_m5 = _mm_mulhi_epi16(psi_i_m1_m5, ONE_OVER_SQRT_2);
    psi_i_m1_m5 = _mm_slli_epi16(psi_i_m1_m5, 1);
    psi_i_m1_m7 = _mm_mulhi_epi16(psi_i_m1_m7, ONE_OVER_SQRT_2);
    psi_i_m1_m7 = _mm_slli_epi16(psi_i_m1_m7, 1);
    psi_i_m3_p7 = _mm_mulhi_epi16(psi_i_m3_p7, ONE_OVER_SQRT_2);
    psi_i_m3_p7 = _mm_slli_epi16(psi_i_m3_p7, 1);
    psi_i_m3_p5 = _mm_mulhi_epi16(psi_i_m3_p5, ONE_OVER_SQRT_2);
    psi_i_m3_p5 = _mm_slli_epi16(psi_i_m3_p5, 1);
    psi_i_m3_p3 = _mm_mulhi_epi16(psi_i_m3_p3, ONE_OVER_SQRT_2);
    psi_i_m3_p3 = _mm_slli_epi16(psi_i_m3_p3, 1);
    psi_i_m3_p1 = _mm_mulhi_epi16(psi_i_m3_p1, ONE_OVER_SQRT_2);
    psi_i_m3_p1 = _mm_slli_epi16(psi_i_m3_p1, 1);
    psi_i_m3_m1 = _mm_mulhi_epi16(psi_i_m3_m1, ONE_OVER_SQRT_2);
    psi_i_m3_m1 = _mm_slli_epi16(psi_i_m3_m1, 1);
    psi_i_m3_m3 = _mm_mulhi_epi16(psi_i_m3_m3, ONE_OVER_SQRT_2);
    psi_i_m3_m3 = _mm_slli_epi16(psi_i_m3_m3, 1);
    psi_i_m3_m5 = _mm_mulhi_epi16(psi_i_m3_m5, ONE_OVER_SQRT_2);
    psi_i_m3_m5 = _mm_slli_epi16(psi_i_m3_m5, 1);
    psi_i_m3_m7 = _mm_mulhi_epi16(psi_i_m3_m7, ONE_OVER_SQRT_2);
    psi_i_m3_m7 = _mm_slli_epi16(psi_i_m3_m7, 1);
    psi_i_m5_p7 = _mm_mulhi_epi16(psi_i_m5_p7, ONE_OVER_SQRT_2);
    psi_i_m5_p7 = _mm_slli_epi16(psi_i_m5_p7, 1);
    psi_i_m5_p5 = _mm_mulhi_epi16(psi_i_m5_p5, ONE_OVER_SQRT_2);
    psi_i_m5_p5 = _mm_slli_epi16(psi_i_m5_p5, 1);
    psi_i_m5_p3 = _mm_mulhi_epi16(psi_i_m5_p3, ONE_OVER_SQRT_2);
    psi_i_m5_p3 = _mm_slli_epi16(psi_i_m5_p3, 1);
    psi_i_m5_p1 = _mm_mulhi_epi16(psi_i_m5_p1, ONE_OVER_SQRT_2);
    psi_i_m5_p1 = _mm_slli_epi16(psi_i_m5_p1, 1);
    psi_i_m5_m1 = _mm_mulhi_epi16(psi_i_m5_m1, ONE_OVER_SQRT_2);
    psi_i_m5_m1 = _mm_slli_epi16(psi_i_m5_m1, 1);
    psi_i_m5_m3 = _mm_mulhi_epi16(psi_i_m5_m3, ONE_OVER_SQRT_2);
    psi_i_m5_m3 = _mm_slli_epi16(psi_i_m5_m3, 1);
    psi_i_m5_m5 = _mm_mulhi_epi16(psi_i_m5_m5, ONE_OVER_SQRT_2);
    psi_i_m5_m5 = _mm_slli_epi16(psi_i_m5_m5, 1);
    psi_i_m5_m7 = _mm_mulhi_epi16(psi_i_m5_m7, ONE_OVER_SQRT_2);
    psi_i_m5_m7 = _mm_slli_epi16(psi_i_m5_m7, 1);
    psi_i_m7_p7 = _mm_mulhi_epi16(psi_i_m7_p7, ONE_OVER_SQRT_2);
    psi_i_m7_p7 = _mm_slli_epi16(psi_i_m7_p7, 1);
    psi_i_m7_p5 = _mm_mulhi_epi16(psi_i_m7_p5, ONE_OVER_SQRT_2);
    psi_i_m7_p5 = _mm_slli_epi16(psi_i_m7_p5, 1);
    psi_i_m7_p3 = _mm_mulhi_epi16(psi_i_m7_p3, ONE_OVER_SQRT_2);
    psi_i_m7_p3 = _mm_slli_epi16(psi_i_m7_p3, 1);
    psi_i_m7_p1 = _mm_mulhi_epi16(psi_i_m7_p1, ONE_OVER_SQRT_2);
    psi_i_m7_p1 = _mm_slli_epi16(psi_i_m7_p1, 1);
    psi_i_m7_m1 = _mm_mulhi_epi16(psi_i_m7_m1, ONE_OVER_SQRT_2);
    psi_i_m7_m1 = _mm_slli_epi16(psi_i_m7_m1, 1);
    psi_i_m7_m3 = _mm_mulhi_epi16(psi_i_m7_m3, ONE_OVER_SQRT_2);
    psi_i_m7_m3 = _mm_slli_epi16(psi_i_m7_m3, 1);
    psi_i_m7_m5 = _mm_mulhi_epi16(psi_i_m7_m5, ONE_OVER_SQRT_2);
    psi_i_m7_m5 = _mm_slli_epi16(psi_i_m7_m5, 1);
    psi_i_m7_m7 = _mm_mulhi_epi16(psi_i_m7_m7, ONE_OVER_SQRT_2);
    psi_i_m7_m7 = _mm_slli_epi16(psi_i_m7_m7, 1);

    psi_a_p7_p7 = _mm_adds_epi16(psi_r_p7_p7, psi_i_p7_p7);
    psi_a_p7_p5 = _mm_adds_epi16(psi_r_p7_p5, psi_i_p7_p5);
    psi_a_p7_p3 = _mm_adds_epi16(psi_r_p7_p3, psi_i_p7_p3);
    psi_a_p7_p1 = _mm_adds_epi16(psi_r_p7_p1, psi_i_p7_p1);
    psi_a_p7_m1 = _mm_adds_epi16(psi_r_p7_m1, psi_i_p7_m1);
    psi_a_p7_m3 = _mm_adds_epi16(psi_r_p7_m3, psi_i_p7_m3);
    psi_a_p7_m5 = _mm_adds_epi16(psi_r_p7_m5, psi_i_p7_m5);
    psi_a_p7_m7 = _mm_adds_epi16(psi_r_p7_m7, psi_i_p7_m7);
    psi_a_p5_p7 = _mm_adds_epi16(psi_r_p5_p7, psi_i_p5_p7);
    psi_a_p5_p5 = _mm_adds_epi16(psi_r_p5_p5, psi_i_p5_p5);
    psi_a_p5_p3 = _mm_adds_epi16(psi_r_p5_p3, psi_i_p5_p3);
    psi_a_p5_p1 = _mm_adds_epi16(psi_r_p5_p1, psi_i_p5_p1);
    psi_a_p5_m1 = _mm_adds_epi16(psi_r_p5_m1, psi_i_p5_m1);
    psi_a_p5_m3 = _mm_adds_epi16(psi_r_p5_m3, psi_i_p5_m3);
    psi_a_p5_m5 = _mm_adds_epi16(psi_r_p5_m5, psi_i_p5_m5);
    psi_a_p5_m7 = _mm_adds_epi16(psi_r_p5_m7, psi_i_p5_m7);
    psi_a_p3_p7 = _mm_adds_epi16(psi_r_p3_p7, psi_i_p3_p7);
    psi_a_p3_p5 = _mm_adds_epi16(psi_r_p3_p5, psi_i_p3_p5);
    psi_a_p3_p3 = _mm_adds_epi16(psi_r_p3_p3, psi_i_p3_p3);
    psi_a_p3_p1 = _mm_adds_epi16(psi_r_p3_p1, psi_i_p3_p1);
    psi_a_p3_m1 = _mm_adds_epi16(psi_r_p3_m1, psi_i_p3_m1);
    psi_a_p3_m3 = _mm_adds_epi16(psi_r_p3_m3, psi_i_p3_m3);
    psi_a_p3_m5 = _mm_adds_epi16(psi_r_p3_m5, psi_i_p3_m5);
    psi_a_p3_m7 = _mm_adds_epi16(psi_r_p3_m7, psi_i_p3_m7);
    psi_a_p1_p7 = _mm_adds_epi16(psi_r_p1_p7, psi_i_p1_p7);
    psi_a_p1_p5 = _mm_adds_epi16(psi_r_p1_p5, psi_i_p1_p5);
    psi_a_p1_p3 = _mm_adds_epi16(psi_r_p1_p3, psi_i_p1_p3);
    psi_a_p1_p1 = _mm_adds_epi16(psi_r_p1_p1, psi_i_p1_p1);
    psi_a_p1_m1 = _mm_adds_epi16(psi_r_p1_m1, psi_i_p1_m1);
    psi_a_p1_m3 = _mm_adds_epi16(psi_r_p1_m3, psi_i_p1_m3);
    psi_a_p1_m5 = _mm_adds_epi16(psi_r_p1_m5, psi_i_p1_m5);
    psi_a_p1_m7 = _mm_adds_epi16(psi_r_p1_m7, psi_i_p1_m7);
    psi_a_m1_p7 = _mm_adds_epi16(psi_r_m1_p7, psi_i_m1_p7);
    psi_a_m1_p5 = _mm_adds_epi16(psi_r_m1_p5, psi_i_m1_p5);
    psi_a_m1_p3 = _mm_adds_epi16(psi_r_m1_p3, psi_i_m1_p3);
    psi_a_m1_p1 = _mm_adds_epi16(psi_r_m1_p1, psi_i_m1_p1);
    psi_a_m1_m1 = _mm_adds_epi16(psi_r_m1_m1, psi_i_m1_m1);
    psi_a_m1_m3 = _mm_adds_epi16(psi_r_m1_m3, psi_i_m1_m3);
    psi_a_m1_m5 = _mm_adds_epi16(psi_r_m1_m5, psi_i_m1_m5);
    psi_a_m1_m7 = _mm_adds_epi16(psi_r_m1_m7, psi_i_m1_m7);
    psi_a_m3_p7 = _mm_adds_epi16(psi_r_m3_p7, psi_i_m3_p7);
    psi_a_m3_p5 = _mm_adds_epi16(psi_r_m3_p5, psi_i_m3_p5);
    psi_a_m3_p3 = _mm_adds_epi16(psi_r_m3_p3, psi_i_m3_p3);
    psi_a_m3_p1 = _mm_adds_epi16(psi_r_m3_p1, psi_i_m3_p1);
    psi_a_m3_m1 = _mm_adds_epi16(psi_r_m3_m1, psi_i_m3_m1);
    psi_a_m3_m3 = _mm_adds_epi16(psi_r_m3_m3, psi_i_m3_m3);
    psi_a_m3_m5 = _mm_adds_epi16(psi_r_m3_m5, psi_i_m3_m5);
    psi_a_m3_m7 = _mm_adds_epi16(psi_r_m3_m7, psi_i_m3_m7);
    psi_a_m5_p7 = _mm_adds_epi16(psi_r_m5_p7, psi_i_m5_p7);
    psi_a_m5_p5 = _mm_adds_epi16(psi_r_m5_p5, psi_i_m5_p5);
    psi_a_m5_p3 = _mm_adds_epi16(psi_r_m5_p3, psi_i_m5_p3);
    psi_a_m5_p1 = _mm_adds_epi16(psi_r_m5_p1, psi_i_m5_p1);
    psi_a_m5_m1 = _mm_adds_epi16(psi_r_m5_m1, psi_i_m5_m1);
    psi_a_m5_m3 = _mm_adds_epi16(psi_r_m5_m3, psi_i_m5_m3);
    psi_a_m5_m5 = _mm_adds_epi16(psi_r_m5_m5, psi_i_m5_m5);
    psi_a_m5_m7 = _mm_adds_epi16(psi_r_m5_m7, psi_i_m5_m7);
    psi_a_m7_p7 = _mm_adds_epi16(psi_r_m7_p7, psi_i_m7_p7);
    psi_a_m7_p5 = _mm_adds_epi16(psi_r_m7_p5, psi_i_m7_p5);
    psi_a_m7_p3 = _mm_adds_epi16(psi_r_m7_p3, psi_i_m7_p3);
    psi_a_m7_p1 = _mm_adds_epi16(psi_r_m7_p1, psi_i_m7_p1);
    psi_a_m7_m1 = _mm_adds_epi16(psi_r_m7_m1, psi_i_m7_m1);
    psi_a_m7_m3 = _mm_adds_epi16(psi_r_m7_m3, psi_i_m7_m3);
    psi_a_m7_m5 = _mm_adds_epi16(psi_r_m7_m5, psi_i_m7_m5);
    psi_a_m7_m7 = _mm_adds_epi16(psi_r_m7_m7, psi_i_m7_m7);

    // Computing different multiples of ||h0||^2
    // x=1, y=1
    ch_mag_2_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
    ch_mag_2_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
    // x=1, y=3
    ch_mag_10_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
    ch_mag_10_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
    // x=1, y=5
    ch_mag_26_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
    ch_mag_26_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
    // x=1, y=7
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=3, y=3
    ch_mag_18_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
    ch_mag_18_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
    // x=3, y=5
    ch_mag_34_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
    ch_mag_34_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
    // x=3, y=7
    ch_mag_58_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_58_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
    // x=5, y=5
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=5, y=7
    ch_mag_74_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
    ch_mag_74_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
    // x=7, y=7
    ch_mag_98_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_98_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_98_over_42_with_sigma2,2);

    // Computing Metrics
    xmm1 = _mm_adds_epi16(psi_a_p7_p7, y0_p_7_7);
    bit_met_p7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_p5, y0_p_7_5);
    bit_met_p7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_p3, y0_p_7_3);
    bit_met_p7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_p1, y0_p_7_1);
    bit_met_p7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_m1, y0_m_7_1);
    bit_met_p7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_m3, y0_m_7_3);
    bit_met_p7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_m5, y0_m_7_5);
    bit_met_p7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p7_m7, y0_m_7_7);
    bit_met_p7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_p7, y0_p_5_7);
    bit_met_p5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_p5, y0_p_5_5);
    bit_met_p5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_p3, y0_p_5_3);
    bit_met_p5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_p1, y0_p_5_1);
    bit_met_p5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_m1, y0_m_5_1);
    bit_met_p5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_m3, y0_m_5_3);
    bit_met_p5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_m5, y0_m_5_5);
    bit_met_p5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p5_m7, y0_m_5_7);
    bit_met_p5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_p7, y0_p_3_7);
    bit_met_p3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_p5, y0_p_3_5);
    bit_met_p3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_p3, y0_p_3_3);
    bit_met_p3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_p1, y0_p_3_1);
    bit_met_p3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_m1, y0_m_3_1);
    bit_met_p3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_m3, y0_m_3_3);
    bit_met_p3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_m5, y0_m_3_5);
    bit_met_p3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p3_m7, y0_m_3_7);
    bit_met_p3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_p7, y0_p_1_7);
    bit_met_p1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_p5, y0_p_1_5);
    bit_met_p1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_p3, y0_p_1_3);
    bit_met_p1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_p1, y0_p_1_1);
    bit_met_p1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_m1, y0_m_1_1);
    bit_met_p1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_m3, y0_m_1_3);
    bit_met_p1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_m5, y0_m_1_5);
    bit_met_p1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_adds_epi16(psi_a_p1_m7, y0_m_1_7);
    bit_met_p1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);

    xmm1 = _mm_subs_epi16(psi_a_m1_p7, y0_m_1_7);
    bit_met_m1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_p5, y0_m_1_5);
    bit_met_m1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_p3, y0_m_1_3);
    bit_met_m1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_p1, y0_m_1_1);
    bit_met_m1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_m1, y0_p_1_1);
    bit_met_m1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_m3, y0_p_1_3);
    bit_met_m1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_m5, y0_p_1_5);
    bit_met_m1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m1_m7, y0_p_1_7);
    bit_met_m1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_p7, y0_m_3_7);
    bit_met_m3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_p5, y0_m_3_5);
    bit_met_m3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_p3, y0_m_3_3);
    bit_met_m3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_p1, y0_m_3_1);
    bit_met_m3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_m1, y0_p_3_1);
    bit_met_m3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_m3, y0_p_3_3);
    bit_met_m3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_m5, y0_p_3_5);
    bit_met_m3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m3_m7, y0_p_3_7);
    bit_met_m3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_p7, y0_m_5_7);
    bit_met_m5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_p5, y0_m_5_5);
    bit_met_m5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_p3, y0_m_5_3);
    bit_met_m5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_p1, y0_m_5_1);
    bit_met_m5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_m1, y0_p_5_1);
    bit_met_m5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_m3, y0_p_5_3);
    bit_met_m5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_m5, y0_p_5_5);
    bit_met_m5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m5_m7, y0_p_5_7);
    bit_met_m5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_p7, y0_m_7_7);
    bit_met_m7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_p5, y0_m_7_5);
    bit_met_m7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_p3, y0_m_7_3);
    bit_met_m7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_p1, y0_m_7_1);
    bit_met_m7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_m1, y0_p_7_1);
    bit_met_m7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_m3, y0_p_7_3);
    bit_met_m7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_m5, y0_p_7_5);
    bit_met_m7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm1 = _mm_subs_epi16(psi_a_m7_m7, y0_p_7_7);
    bit_met_m7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);

    // Detection for 1st bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
    xmm1 = _mm_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
    xmm2 = _mm_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
    xmm3 = _mm_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
    xmm1 = _mm_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
    xmm2 = _mm_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
    xmm1 = _mm_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
    xmm1 = _mm_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
    xmm1 = _mm_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
    xmm2 = _mm_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
    xmm3 = _mm_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
    xmm2 = _mm_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
    xmm3 = _mm_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
    xmm2 = _mm_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
    xmm1 = _mm_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
    xmm3 = _mm_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 2nd bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 3rd bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 4th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // Detection for 5th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 6th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
    // RE 1
    j = 24*i;
    stream0_out[j + 0] = ((short *)&y0r)[0];
    stream0_out[j + 1] = ((short *)&y1r)[0];
    stream0_out[j + 2] = ((short *)&y2r)[0];
    stream0_out[j + 3] = ((short *)&y0i)[0];
    stream0_out[j + 4] = ((short *)&y1i)[0];
    stream0_out[j + 5] = ((short *)&y2i)[0];
    // RE 2
    stream0_out[j + 6] = ((short *)&y0r)[1];
    stream0_out[j + 7] = ((short *)&y1r)[1];
    stream0_out[j + 8] = ((short *)&y2r)[1];
    stream0_out[j + 9] = ((short *)&y0i)[1];
    stream0_out[j + 10] = ((short *)&y1i)[1];
    stream0_out[j + 11] = ((short *)&y2i)[1];
    // RE 3
    stream0_out[j + 12] = ((short *)&y0r)[2];
    stream0_out[j + 13] = ((short *)&y1r)[2];
    stream0_out[j + 14] = ((short *)&y2r)[2];
    stream0_out[j + 15] = ((short *)&y0i)[2];
    stream0_out[j + 16] = ((short *)&y1i)[2];
    stream0_out[j + 17] = ((short *)&y2i)[2];
    // RE 4
    stream0_out[j + 18] = ((short *)&y0r)[3];
    stream0_out[j + 19] = ((short *)&y1r)[3];
    stream0_out[j + 20] = ((short *)&y2r)[3];
    stream0_out[j + 21] = ((short *)&y0i)[3];
    stream0_out[j + 22] = ((short *)&y1i)[3];
    stream0_out[j + 23] = ((short *)&y2i)[3];
    // RE 5
    stream0_out[j + 24] = ((short *)&y0r)[4];
    stream0_out[j + 25] = ((short *)&y1r)[4];
    stream0_out[j + 26] = ((short *)&y2r)[4];
    stream0_out[j + 27] = ((short *)&y0i)[4];
    stream0_out[j + 28] = ((short *)&y1i)[4];
    stream0_out[j + 29] = ((short *)&y2i)[4];
    // RE 6
    stream0_out[j + 30] = ((short *)&y0r)[5];
    stream0_out[j + 31] = ((short *)&y1r)[5];
    stream0_out[j + 32] = ((short *)&y2r)[5];
    stream0_out[j + 33] = ((short *)&y0i)[5];
    stream0_out[j + 34] = ((short *)&y1i)[5];
    stream0_out[j + 35] = ((short *)&y2i)[5];
    // RE 7
    stream0_out[j + 36] = ((short *)&y0r)[6];
    stream0_out[j + 37] = ((short *)&y1r)[6];
    stream0_out[j + 38] = ((short *)&y2r)[6];
    stream0_out[j + 39] = ((short *)&y0i)[6];
    stream0_out[j + 40] = ((short *)&y1i)[6];
    stream0_out[j + 41] = ((short *)&y2i)[6];
    // RE 8
    stream0_out[j + 42] = ((short *)&y0r)[7];
    stream0_out[j + 43] = ((short *)&y1r)[7];
    stream0_out[j + 44] = ((short *)&y2r)[7];
    stream0_out[j + 45] = ((short *)&y0i)[7];
    stream0_out[j + 46] = ((short *)&y1i)[7];
    stream0_out[j + 47] = ((short *)&y2i)[7];
#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}


/*! \brief Per-symbol LLR computation wrapper around nr_qam64_qpsk().
 *
 * Locates the data of OFDM symbol \p symbol inside the first entry of each
 * input array (index 0, at sample offset symbol*N_RB_DL*12), works out how
 * many resource elements are to be processed for this symbol, and hands
 * everything to nr_qam64_qpsk().
 *
 * \param frame_parms         frame parameters; N_RB_DL, Ncp and nb_antenna_ports_gNB are read
 * \param rxdataF_comp        passed to nr_qam64_qpsk() as its 1st argument (only [0] is used)
 * \param rxdataF_comp_i      passed to nr_qam64_qpsk() as its 2nd argument (only [0] is used)
 * \param dl_ch_mag           passed to nr_qam64_qpsk() as its 3rd argument (only [0] is used)
 * \param rho_i               passed to nr_qam64_qpsk() as its 5th argument (only [0] is used)
 * \param dlsch_llr           start of the LLR output buffer; used when first_symbol_flag == 1
 * \param symbol              OFDM symbol index
 * \param first_symbol_flag   1: write LLRs starting at dlsch_llr; otherwise continue at *llr16p
 * \param nb_rb               number of resource blocks
 * \param pbch_pss_sss_adjust number of REs subtracted from the per-symbol RE count
 * \param llr16p              in: current write position (read only when first_symbol_flag != 1);
 *                            out: write position advanced by 6*len int16_t entries
 * \return always 0
 */
int nr_dlsch_64qam_qpsk_llr(NR_DL_FRAME_PARMS *frame_parms,
                         int32_t **rxdataF_comp,
                         int32_t **rxdataF_comp_i,
                         int32_t **dl_ch_mag,
                         int32_t **rho_i,
                         int16_t *dlsch_llr,
                         uint8_t symbol,
                         uint8_t first_symbol_flag,
                         uint16_t nb_rb,
                         uint16_t pbch_pss_sss_adjust,
                         int16_t **llr16p)
{

  int16_t *rxF      = (int16_t*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rxF_i    = (int16_t*)&rxdataF_comp_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *ch_mag   = (int16_t*)&dl_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *rho      = (int16_t*)&rho_i[0][(symbol*frame_parms->N_RB_DL*12)];
  int16_t *llr16;
  int len;
  // symbol index folded back into the range of one slot of (7-Ncp) symbols
  uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? (symbol-(7-frame_parms->Ncp)) : symbol;

  // The first symbol starts at the beginning of the LLR buffer; every other
  // symbol continues at the position stored in *llr16p (updated at the end of
  // each call).
  if (first_symbol_flag == 1) {
    llr16 = dlsch_llr;
  } else {
    llr16 = *llr16p;
  }

  // The message names this function so a failure can be traced to the right place
  AssertFatal(llr16!=NULL,"nr_dlsch_64qam_qpsk_llr:llr is null, symbol %d\n",symbol);

  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
    // if symbol has pilots
    if (frame_parms->nb_antenna_ports_gNB!=1)
      // in 2 antenna ports we have 8 REs per symbol per RB
      len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
    else
      // for 1 antenna port we have 10 REs per symbol per RB
      len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
  } else {
    // symbol has no pilots
    len = (nb_rb*12) - pbch_pss_sss_adjust;
  }

  nr_qam64_qpsk((short *)rxF,
             (short *)rxF_i,
             (short *)ch_mag,
             (short *)llr16,
             (short *)rho,
             len);

  // advance by 6 LLRs per processed RE and publish the new write position
  llr16 += (6*len);
  *llr16p = llr16;
  return(0);
}



void nr_qam64_qam16(short *stream0_in,
                 short *stream1_in,
                 short *ch_mag,
                 short *ch_mag_i,
                 short *stream0_out,
                 short *rho01,
                 int length
     )
{

  /*
    Author: S. Wagner
    Date: 31-07-12

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)

  __m128i *rho01_128i      = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *ch_mag_128i     = (__m128i *)ch_mag;
  __m128i *ch_mag_128i_i   = (__m128i *)ch_mag_i;

  __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16)
  __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16)
  __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15)
  __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(5/sqrt(42)*2^15)
  __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14
  __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14
  __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15)
  __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14
  __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15)
  __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15)
  __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15)
  __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15)
  __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15)
  __m128i ONE_OVER_SQRT_10_Q15 = _mm_set1_epi16(10362); // round(1/sqrt(10)*2^15)
  __m128i THREE_OVER_SQRT_10 = _mm_set1_epi16(31086); // round(3/sqrt(10)*2^15)
  __m128i SQRT_10_OVER_FOUR = _mm_set1_epi16(25905); // round(sqrt(10)/4*2^15)


  __m128i ch_mag_int;
  __m128i ch_mag_des;
  __m128i ch_mag_98_over_42_with_sigma2;
  __m128i ch_mag_74_over_42_with_sigma2;
  __m128i ch_mag_58_over_42_with_sigma2;
  __m128i ch_mag_50_over_42_with_sigma2;
  __m128i ch_mag_34_over_42_with_sigma2;
  __m128i ch_mag_18_over_42_with_sigma2;
  __m128i ch_mag_26_over_42_with_sigma2;
  __m128i ch_mag_10_over_42_with_sigma2;
  __m128i ch_mag_2_over_42_with_sigma2;
  __m128i  y0r_one_over_sqrt_21;
  __m128i  y0r_three_over_sqrt_21;
  __m128i  y0r_five_over_sqrt_21;
  __m128i  y0r_seven_over_sqrt_21;
  __m128i  y0i_one_over_sqrt_21;
  __m128i  y0i_three_over_sqrt_21;
  __m128i  y0i_five_over_sqrt_21;
  __m128i  y0i_seven_over_sqrt_21;

#elif defined(__arm__)

#endif
  int i,j;



  for (i=0; i<length>>2; i+=2) {

#if defined(__x86_64__) || defined(__i386__)
    // Get rho
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
    rho_rpi_5_5 = _mm_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
    rho_rmi_5_5 = _mm_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
    rho_rpi_7_7 = _mm_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
    rho_rmi_7_7 = _mm_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);

    rho_rpi_5_5 = _mm_slli_epi16(rho_rpi_5_5, 1);
    rho_rmi_5_5 = _mm_slli_epi16(rho_rmi_5_5, 1);
    rho_rpi_7_7 = _mm_slli_epi16(rho_rpi_7_7, 2);
    rho_rmi_7_7 = _mm_slli_epi16(rho_rmi_7_7, 2);

    xmm4 = _mm_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
    xmm5 = _mm_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
    xmm6 = _mm_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
    xmm7 = _mm_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
    xmm8 = _mm_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
    xmm7 = _mm_slli_epi16(xmm7, 1);
    xmm8 = _mm_slli_epi16(xmm8, 2);

    rho_rpi_1_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_1_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_1_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_1_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_1_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
    rho_rpi_3_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_3_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_3_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_3_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_3_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_3_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 1);
    rho_rpi_5_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_5_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_5_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_5_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_5_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_5_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 2);
    rho_rpi_7_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_7_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_7_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_7_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_7_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_7_5 = _mm_subs_epi16(xmm4, xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    // Psi_r calculation from rho_rpi or rho_rmi
    xmm0 = _mm_setzero_si128(); // ZERO for abs_pi16
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1r);
    psi_r_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1r);
    psi_r_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1r);
    psi_r_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1r);
    psi_r_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1r);
    psi_r_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1r);
    psi_r_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1r);
    psi_r_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1r);
    psi_r_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1r);
    psi_r_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1r);
    psi_r_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1r);
    psi_r_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1r);
    psi_r_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1r);
    psi_r_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1r);
    psi_r_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1r);
    psi_r_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1r);
    psi_r_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1r);
    psi_r_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1r);
    psi_r_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1r);
    psi_r_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1r);
    psi_r_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1r);
    psi_r_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1r);
    psi_r_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1r);
    psi_r_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1r);
    psi_r_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1r);
    psi_r_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1r);
    psi_r_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1r);
    psi_r_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1r);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1r);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1r);
    psi_r_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1r);
    psi_r_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1r);
    psi_r_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1r);
    psi_r_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1r);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1r);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1r);
    psi_r_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1r);
    psi_r_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1r);
    psi_r_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1r);
    psi_r_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1r);
    psi_r_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1r);
    psi_r_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1r);
    psi_r_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1r);
    psi_r_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1r);
    psi_r_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1r);
    psi_r_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1r);
    psi_r_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1r);
    psi_r_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1r);
    psi_r_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1r);
    psi_r_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1r);
    psi_r_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1r);
    psi_r_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1r);
    psi_r_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1r);
    psi_r_m7_m7 = _mm_abs_epi16(xmm2);

    // Psi_i calculation from rho_rpi or rho_rmi
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1i);
    psi_i_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1i);
    psi_i_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1i);
    psi_i_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1i);
    psi_i_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1i);
    psi_i_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1i);
    psi_i_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1i);
    psi_i_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1i);
    psi_i_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1i);
    psi_i_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1i);
    psi_i_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1i);
    psi_i_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1i);
    psi_i_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1i);
    psi_i_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1i);
    psi_i_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1i);
    psi_i_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1i);
    psi_i_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1i);
    psi_i_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1i);
    psi_i_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1i);
    psi_i_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1i);
    psi_i_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1i);
    psi_i_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1i);
    psi_i_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1i);
    psi_i_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1i);
    psi_i_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1i);
    psi_i_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1i);
    psi_i_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1i);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1i);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1i);
    psi_i_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1i);
    psi_i_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1i);
    psi_i_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1i);
    psi_i_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1i);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1i);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1i);
    psi_i_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1i);
    psi_i_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1i);
    psi_i_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1i);
    psi_i_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1i);
    psi_i_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1i);
    psi_i_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1i);
    psi_i_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1i);
    psi_i_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1i);
    psi_i_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1i);
    psi_i_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1i);
    psi_i_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1i);
    psi_i_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1i);
    psi_i_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1i);
    psi_i_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1i);
    psi_i_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1i);
    psi_i_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1i);
    psi_i_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1i);
    psi_i_m7_m7 = _mm_abs_epi16(xmm2);


    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3);

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);

    y0r_one_over_sqrt_21   = _mm_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
    y0r_three_over_sqrt_21 = _mm_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_slli_epi16(y0r_five_over_sqrt_21, 1);
    y0r_seven_over_sqrt_21 = _mm_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
    y0r_seven_over_sqrt_21 = _mm_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14

    y0i_one_over_sqrt_21   = _mm_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
    y0i_three_over_sqrt_21 = _mm_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_slli_epi16(y0i_five_over_sqrt_21, 1);
    y0i_seven_over_sqrt_21 = _mm_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
    y0i_seven_over_sqrt_21 = _mm_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14

    y0_p_7_1 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_7_3 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_7_5 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_7_7 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_5_1 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_5_3 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_5_5 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_5_7 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_3_5 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_3_7 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_1_1 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_1_3 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_1_5 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_1_7 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);

    y0_m_1_1 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_1_3 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_1_5 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_1_7 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_3_5 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_3_7 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_5_1 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_5_3 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_5_5 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_5_7 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_7_1 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_7_3 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_7_5 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_7_7 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);

    interference_abs_epi16(psi_r_p7_p7, ch_mag_int, a_r_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p5, ch_mag_int, a_r_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p3, ch_mag_int, a_r_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_p1, ch_mag_int, a_r_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m1, ch_mag_int, a_r_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m3, ch_mag_int, a_r_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m5, ch_mag_int, a_r_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p7_m7, ch_mag_int, a_r_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p7, ch_mag_int, a_r_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p5, ch_mag_int, a_r_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p3, ch_mag_int, a_r_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_p1, ch_mag_int, a_r_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m1, ch_mag_int, a_r_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m3, ch_mag_int, a_r_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m5, ch_mag_int, a_r_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p5_m7, ch_mag_int, a_r_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p7, ch_mag_int, a_r_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p5, ch_mag_int, a_r_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p3, ch_mag_int, a_r_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_p1, ch_mag_int, a_r_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m1, ch_mag_int, a_r_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m3, ch_mag_int, a_r_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m5, ch_mag_int, a_r_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p3_m7, ch_mag_int, a_r_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p7, ch_mag_int, a_r_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p5, ch_mag_int, a_r_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p3, ch_mag_int, a_r_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_p1, ch_mag_int, a_r_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m1, ch_mag_int, a_r_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m3, ch_mag_int, a_r_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m5, ch_mag_int, a_r_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_p1_m7, ch_mag_int, a_r_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p7, ch_mag_int, a_r_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p5, ch_mag_int, a_r_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p3, ch_mag_int, a_r_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_p1, ch_mag_int, a_r_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m1, ch_mag_int, a_r_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m3, ch_mag_int, a_r_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m5, ch_mag_int, a_r_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m1_m7, ch_mag_int, a_r_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p7, ch_mag_int, a_r_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p5, ch_mag_int, a_r_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p3, ch_mag_int, a_r_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_p1, ch_mag_int, a_r_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m1, ch_mag_int, a_r_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m3, ch_mag_int, a_r_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m5, ch_mag_int, a_r_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m3_m7, ch_mag_int, a_r_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p7, ch_mag_int, a_r_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p5, ch_mag_int, a_r_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p3, ch_mag_int, a_r_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_p1, ch_mag_int, a_r_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m1, ch_mag_int, a_r_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m3, ch_mag_int, a_r_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m5, ch_mag_int, a_r_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m5_m7, ch_mag_int, a_r_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p7, ch_mag_int, a_r_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p5, ch_mag_int, a_r_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p3, ch_mag_int, a_r_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_p1, ch_mag_int, a_r_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m1, ch_mag_int, a_r_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m3, ch_mag_int, a_r_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m5, ch_mag_int, a_r_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_r_m7_m7, ch_mag_int, a_r_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    interference_abs_epi16(psi_i_p7_p7, ch_mag_int, a_i_p7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p5, ch_mag_int, a_i_p7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p3, ch_mag_int, a_i_p7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_p1, ch_mag_int, a_i_p7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m1, ch_mag_int, a_i_p7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m3, ch_mag_int, a_i_p7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m5, ch_mag_int, a_i_p7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p7_m7, ch_mag_int, a_i_p7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p7, ch_mag_int, a_i_p5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p5, ch_mag_int, a_i_p5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p3, ch_mag_int, a_i_p5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_p1, ch_mag_int, a_i_p5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m1, ch_mag_int, a_i_p5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m3, ch_mag_int, a_i_p5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m5, ch_mag_int, a_i_p5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p5_m7, ch_mag_int, a_i_p5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p7, ch_mag_int, a_i_p3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p5, ch_mag_int, a_i_p3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p3, ch_mag_int, a_i_p3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_p1, ch_mag_int, a_i_p3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m1, ch_mag_int, a_i_p3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m3, ch_mag_int, a_i_p3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m5, ch_mag_int, a_i_p3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p3_m7, ch_mag_int, a_i_p3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p7, ch_mag_int, a_i_p1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p5, ch_mag_int, a_i_p1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p3, ch_mag_int, a_i_p1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_p1, ch_mag_int, a_i_p1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m1, ch_mag_int, a_i_p1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m3, ch_mag_int, a_i_p1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m5, ch_mag_int, a_i_p1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_p1_m7, ch_mag_int, a_i_p1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p7, ch_mag_int, a_i_m1_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p5, ch_mag_int, a_i_m1_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p3, ch_mag_int, a_i_m1_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_p1, ch_mag_int, a_i_m1_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m1, ch_mag_int, a_i_m1_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m3, ch_mag_int, a_i_m1_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m5, ch_mag_int, a_i_m1_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m1_m7, ch_mag_int, a_i_m1_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p7, ch_mag_int, a_i_m3_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p5, ch_mag_int, a_i_m3_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p3, ch_mag_int, a_i_m3_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_p1, ch_mag_int, a_i_m3_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m1, ch_mag_int, a_i_m3_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m3, ch_mag_int, a_i_m3_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m5, ch_mag_int, a_i_m3_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m3_m7, ch_mag_int, a_i_m3_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p7, ch_mag_int, a_i_m5_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p5, ch_mag_int, a_i_m5_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p3, ch_mag_int, a_i_m5_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_p1, ch_mag_int, a_i_m5_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m1, ch_mag_int, a_i_m5_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m3, ch_mag_int, a_i_m5_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m5, ch_mag_int, a_i_m5_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m5_m7, ch_mag_int, a_i_m5_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p7, ch_mag_int, a_i_m7_p7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p5, ch_mag_int, a_i_m7_p5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p3, ch_mag_int, a_i_m7_p3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_p1, ch_mag_int, a_i_m7_p1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m1, ch_mag_int, a_i_m7_m1, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m3, ch_mag_int, a_i_m7_m3, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m5, ch_mag_int, a_i_m7_m5, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);
    interference_abs_epi16(psi_i_m7_m7, ch_mag_int, a_i_m7_m7, ONE_OVER_SQRT_10_Q15, THREE_OVER_SQRT_10);

    // Calculation of a group of two terms in the bit metric involving product of psi and interference
    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);

    // Calculation of a group of two terms in the bit metric involving squares of interference
    square_a_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p7);
    square_a_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p5);
    square_a_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p3);
    square_a_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_p1);
    square_a_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m1);
    square_a_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m3);
    square_a_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m5);
    square_a_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p7_m7);
    square_a_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p7);
    square_a_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p5);
    square_a_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p3);
    square_a_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_p1);
    square_a_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m1);
    square_a_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m3);
    square_a_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m5);
    square_a_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p5_m7);
    square_a_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p7);
    square_a_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p5);
    square_a_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p3);
    square_a_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_p1);
    square_a_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m1);
    square_a_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m3);
    square_a_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m5);
    square_a_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p3_m7);
    square_a_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p7);
    square_a_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p5);
    square_a_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p3);
    square_a_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_p1);
    square_a_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m1);
    square_a_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m3);
    square_a_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m5);
    square_a_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_p1_m7);
    square_a_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p7);
    square_a_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p5);
    square_a_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p3);
    square_a_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_p1);
    square_a_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m1);
    square_a_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m3);
    square_a_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m5);
    square_a_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m1_m7);
    square_a_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p7);
    square_a_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p5);
    square_a_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p3);
    square_a_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_p1);
    square_a_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m1);
    square_a_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m3);
    square_a_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m5);
    square_a_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m3_m7);
    square_a_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p7);
    square_a_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p5);
    square_a_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p3);
    square_a_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_p1);
    square_a_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m1);
    square_a_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m3);
    square_a_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m5);
    square_a_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m5_m7);
    square_a_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p7);
    square_a_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p5);
    square_a_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p3);
    square_a_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_p1);
    square_a_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m1);
    square_a_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m3);
    square_a_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m5);
    square_a_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_10_OVER_FOUR, a_sq_m7_m7);

    // Computing different multiples of ||h0||^2
    // x=1, y=1
    ch_mag_2_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
    ch_mag_2_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
    // x=1, y=3
    ch_mag_10_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
    ch_mag_10_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
    // x=1, y=5
    ch_mag_26_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
    ch_mag_26_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
    // x=1, y=7
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=3, y=3
    ch_mag_18_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
    ch_mag_18_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
    // x=3, y=5
    ch_mag_34_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
    ch_mag_34_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
    // x=3, y=7
    ch_mag_58_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_58_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
    // x=5, y=5
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=5, y=7
    ch_mag_74_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
    ch_mag_74_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
    // x=7, y=7
    ch_mag_98_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_98_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_98_over_42_with_sigma2,2);

    // Computing Metrics
    xmm0 = _mm_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_7);
    bit_met_p7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_5);
    bit_met_p7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_3);
    bit_met_p7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_1);
    bit_met_p7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_1);
    bit_met_p7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_3);
    bit_met_p7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_5);
    bit_met_p7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_7);
    bit_met_p7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_7);
    bit_met_p5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_5);
    bit_met_p5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_3);
    bit_met_p5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_1);
    bit_met_p5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_1);
    bit_met_p5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_3);
    bit_met_p5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_5);
    bit_met_p5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_7);
    bit_met_p5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_7);
    bit_met_p3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_5);
    bit_met_p3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_3);
    bit_met_p3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_1);
    bit_met_p3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_1);
    bit_met_p3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_3);
    bit_met_p3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_5);
    bit_met_p3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_7);
    bit_met_p3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_7);
    bit_met_p1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_5);
    bit_met_p1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_3);
    bit_met_p1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_1);
    bit_met_p1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_1);
    bit_met_p1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_3);
    bit_met_p1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_5);
    bit_met_p1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_7);
    bit_met_p1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);

    xmm0 = _mm_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_7);
    bit_met_m1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_5);
    bit_met_m1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_3);
    bit_met_m1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_1);
    bit_met_m1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_1);
    bit_met_m1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_3);
    bit_met_m1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_5);
    bit_met_m1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_7);
    bit_met_m1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_7);
    bit_met_m3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_5);
    bit_met_m3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_3);
    bit_met_m3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_1);
    bit_met_m3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_1);
    bit_met_m3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_3);
    bit_met_m3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_5);
    bit_met_m3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_7);
    bit_met_m3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_7);
    bit_met_m5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_5);
    bit_met_m5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_3);
    bit_met_m5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_1);
    bit_met_m5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_1);
    bit_met_m5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_3);
    bit_met_m5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_5);
    bit_met_m5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_7);
    bit_met_m5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_7);
    bit_met_m7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_5);
    bit_met_m7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_3);
    bit_met_m7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_1);
    bit_met_m7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_1);
    bit_met_m7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_3);
    bit_met_m7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_5);
    bit_met_m7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_7);
    bit_met_m7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);

    // Detection for 1st bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
    xmm1 = _mm_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
    xmm2 = _mm_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
    xmm3 = _mm_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
    xmm1 = _mm_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
    xmm2 = _mm_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
    xmm1 = _mm_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
    xmm1 = _mm_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
    xmm1 = _mm_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
    xmm2 = _mm_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
    xmm3 = _mm_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
    xmm2 = _mm_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
    xmm3 = _mm_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
    xmm2 = _mm_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
    xmm1 = _mm_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
    xmm3 = _mm_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 2nd bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 3rd bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

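    // Detection for 4th bit (LTE mapping)
    // NOTE(review): in the p5 row of the first max group below, xmm3 is computed
    // from bit_met_m5_p5 for both operands, so bit_met_m7_p5 never enters the max;
    // the sibling rows pair m5 with m7 -- looks like a copy-paste slip, confirm.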
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // Detection for 5th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

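    // Detection for 6th bit (LTE mapping)
    // NOTE(review): in the p1 row of the first max group below, xmm3 is computed
    // from bit_met_m5_p1 for both operands, so bit_met_m7_p1 never enters the max;
    // the sibling rows pair m5 with m7 -- looks like a copy-paste slip, confirm.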
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
    // RE 1
    j = 24*i;
    stream0_out[j + 0] = ((short *)&y0r)[0];
    stream0_out[j + 1] = ((short *)&y1r)[0];
    stream0_out[j + 2] = ((short *)&y2r)[0];
    stream0_out[j + 3] = ((short *)&y0i)[0];
    stream0_out[j + 4] = ((short *)&y1i)[0];
    stream0_out[j + 5] = ((short *)&y2i)[0];
    // RE 2
    stream0_out[j + 6] = ((short *)&y0r)[1];
    stream0_out[j + 7] = ((short *)&y1r)[1];
    stream0_out[j + 8] = ((short *)&y2r)[1];
    stream0_out[j + 9] = ((short *)&y0i)[1];
    stream0_out[j + 10] = ((short *)&y1i)[1];
    stream0_out[j + 11] = ((short *)&y2i)[1];
    // RE 3
    stream0_out[j + 12] = ((short *)&y0r)[2];
    stream0_out[j + 13] = ((short *)&y1r)[2];
    stream0_out[j + 14] = ((short *)&y2r)[2];
    stream0_out[j + 15] = ((short *)&y0i)[2];
    stream0_out[j + 16] = ((short *)&y1i)[2];
    stream0_out[j + 17] = ((short *)&y2i)[2];
    // RE 4
    stream0_out[j + 18] = ((short *)&y0r)[3];
    stream0_out[j + 19] = ((short *)&y1r)[3];
    stream0_out[j + 20] = ((short *)&y2r)[3];
    stream0_out[j + 21] = ((short *)&y0i)[3];
    stream0_out[j + 22] = ((short *)&y1i)[3];
    stream0_out[j + 23] = ((short *)&y2i)[3];
    // RE 5
    stream0_out[j + 24] = ((short *)&y0r)[4];
    stream0_out[j + 25] = ((short *)&y1r)[4];
    stream0_out[j + 26] = ((short *)&y2r)[4];
    stream0_out[j + 27] = ((short *)&y0i)[4];
    stream0_out[j + 28] = ((short *)&y1i)[4];
    stream0_out[j + 29] = ((short *)&y2i)[4];
    // RE 6
    stream0_out[j + 30] = ((short *)&y0r)[5];
    stream0_out[j + 31] = ((short *)&y1r)[5];
    stream0_out[j + 32] = ((short *)&y2r)[5];
    stream0_out[j + 33] = ((short *)&y0i)[5];
    stream0_out[j + 34] = ((short *)&y1i)[5];
    stream0_out[j + 35] = ((short *)&y2i)[5];
    // RE 7
    stream0_out[j + 36] = ((short *)&y0r)[6];
    stream0_out[j + 37] = ((short *)&y1r)[6];
    stream0_out[j + 38] = ((short *)&y2r)[6];
    stream0_out[j + 39] = ((short *)&y0i)[6];
    stream0_out[j + 40] = ((short *)&y1i)[6];
    stream0_out[j + 41] = ((short *)&y2i)[6];
    // RE 8
    stream0_out[j + 42] = ((short *)&y0r)[7];
    stream0_out[j + 43] = ((short *)&y1r)[7];
    stream0_out[j + 44] = ((short *)&y2r)[7];
    stream0_out[j + 45] = ((short *)&y0i)[7];
    stream0_out[j + 46] = ((short *)&y1i)[7];
    stream0_out[j + 47] = ((short *)&y2i)[7];

#elif defined(__arm__)

#endif
  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif

}


/*! \brief Per-symbol wrapper around nr_qam64_qam16(): selects the buffers of one
 *         OFDM symbol, works out how many resource elements (REs) it carries and
 *         runs the LLR kernel on them.
 *
 * Only index 0 of every int32_t** input array is used. Each input is addressed at
 * offset symbol*N_RB_DL*12 and then reinterpreted as int16_t data.
 *
 * \param frame_parms         Frame parameters; N_RB_DL, Ncp and nb_antenna_ports_gNB are read
 * \param rxdataF_comp        First input stream handed to nr_qam64_qam16()
 *                            (presumably the compensated desired stream - TODO confirm)
 * \param rxdataF_comp_i      Second input stream handed to nr_qam64_qam16()
 *                            (presumably the compensated interfering stream - TODO confirm)
 * \param dl_ch_mag           Channel magnitudes associated with rxdataF_comp
 * \param dl_ch_mag_i         Channel magnitudes associated with rxdataF_comp_i
 * \param rho_i               Cross-correlation input handed to nr_qam64_qam16()
 * \param dlsch_llr           Start of the LLR buffer; used as write position when
 *                            first_symbol_flag == 1
 * \param symbol              OFDM symbol index
 * \param first_symbol_flag   1: write from dlsch_llr, otherwise continue from *llr16p
 * \param nb_rb               Number of resource blocks to process
 * \param pbch_pss_sss_adjust Number of REs to subtract from the symbol (scaled down
 *                            on pilot symbols)
 * \param llr16p              In/out write cursor. Must not be NULL: it is always
 *                            updated to point just past the LLRs of this symbol
 * \return always 0
 */
int nr_dlsch_64qam_16qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                          int32_t **rxdataF_comp,
                          int32_t **rxdataF_comp_i,
                          int32_t **dl_ch_mag,
                          int32_t **dl_ch_mag_i,
                          int32_t **rho_i,
                          int16_t *dlsch_llr,
                          uint8_t symbol,
                          uint8_t first_symbol_flag,
                          uint16_t nb_rb,
                          uint16_t pbch_pss_sss_adjust,
                          int16_t **llr16p)
{
  // Offset of this symbol inside the per-antenna buffers (12 REs per RB)
  const int symbol_offset = symbol*frame_parms->N_RB_DL*12;

  int16_t *rxF      = (int16_t*)&rxdataF_comp[0][symbol_offset];
  int16_t *rxF_i    = (int16_t*)&rxdataF_comp_i[0][symbol_offset];
  int16_t *ch_mag   = (int16_t*)&dl_ch_mag[0][symbol_offset];
  int16_t *ch_mag_i = (int16_t*)&dl_ch_mag_i[0][symbol_offset];
  int16_t *rho      = (int16_t*)&rho_i[0][symbol_offset];
  int16_t *llr16;
  int len;
  uint8_t symbol_mod = (symbol >= (7-frame_parms->Ncp))? (symbol-(7-frame_parms->Ncp)) : symbol;

  // The cursor is written back unconditionally at the end of this function (and
  // read below when this is not the first symbol), so reject a NULL cursor here
  // instead of crashing after the kernel has already run.
  AssertFatal(llr16p!=NULL,"nr_dlsch_64qam_16qam_llr:llr16p is null, symbol %d\n",symbol);

  // The first symbol starts at the beginning of the LLR buffer, every other
  // symbol continues where the previous call stopped.
  if (first_symbol_flag == 1) {
    llr16 = dlsch_llr;
  } else {
    llr16 = *llr16p;
  }

  AssertFatal(llr16!=NULL,"nr_dlsch_64qam_16qam_llr:llr is null, symbol %d\n",symbol);

  if ((symbol_mod==0) || (symbol_mod==(4-frame_parms->Ncp))) {
    // symbol has pilots
    if (frame_parms->nb_antenna_ports_gNB!=1) {
      // in 2 antenna ports we have 8 REs per symbol per RB
      len = (nb_rb*8) - (2*pbch_pss_sss_adjust/3);
    } else {
      // for 1 antenna port we have 10 REs per symbol per RB
      len = (nb_rb*10) - (5*pbch_pss_sss_adjust/6);
    }
  } else {
    // symbol has no pilots: all 12 REs per RB are available
    len = (nb_rb*12) - pbch_pss_sss_adjust;
  }

  nr_qam64_qam16(rxF,
              rxF_i,
              ch_mag,
              ch_mag_i,
              llr16,
              rho,
              len);

  // Advance the cursor by 6 int16_t values per processed RE
  llr16 += (6*len);
  *llr16p = llr16;
  return(0);
}

#if 0
void qam64_qam64(short *stream0_in,
                 short *stream1_in,
                 short *ch_mag,
                 short *ch_mag_i,
                 short *stream0_out,
                 short *rho01,
                 int length
     )
{

  /*
    Author: S. Wagner
    Date: 31-07-12

    Input:
    stream0_in:  MF filter for 1st stream, i.e., y0=h0'*y
    stream1_in:  MF filter for 2nd stream, i.e., y1=h1'*y
    ch_mag:      4*h0/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    ch_mag_i:    4*h1/sqrt(42), [Re0 Im0 Re1 Im1] s.t. Im0=Re0, Im1=Re1, etc
    rho01:       Channel cross correlation, i.e., h1'*h0

    Output:
    stream0_out: output LLRs for 1st stream
  */

#if defined(__x86_64__) || defined(__i386__)

  __m128i *rho01_128i      = (__m128i *)rho01;
  __m128i *stream0_128i_in = (__m128i *)stream0_in;
  __m128i *stream1_128i_in = (__m128i *)stream1_in;
  __m128i *ch_mag_128i     = (__m128i *)ch_mag;
  __m128i *ch_mag_128i_i   = (__m128i *)ch_mag_i;

  __m128i ONE_OVER_SQRT_42 = _mm_set1_epi16(10112); // round(1/sqrt(42)*2^16)
  __m128i THREE_OVER_SQRT_42 = _mm_set1_epi16(30337); // round(3/sqrt(42)*2^16)
  __m128i FIVE_OVER_SQRT_42 = _mm_set1_epi16(25281); // round(5/sqrt(42)*2^15)
  __m128i SEVEN_OVER_SQRT_42 = _mm_set1_epi16(17697); // round(7/sqrt(42)*2^14) Q2.14
  __m128i ONE_OVER_SQRT_2 = _mm_set1_epi16(23170); // round(1/sqrt(2)*2^15)
  __m128i ONE_OVER_SQRT_2_42 = _mm_set1_epi16(3575); // round(1/sqrt(2*42)*2^15)
  __m128i THREE_OVER_SQRT_2_42 = _mm_set1_epi16(10726); // round(3/sqrt(2*42)*2^15)
  __m128i FIVE_OVER_SQRT_2_42 = _mm_set1_epi16(17876); // round(5/sqrt(2*42)*2^15)
  __m128i SEVEN_OVER_SQRT_2_42 = _mm_set1_epi16(25027); // round(7/sqrt(2*42)*2^15)
  __m128i FORTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(30969); // round(49/(4*sqrt(42))*2^14), Q2.14
  __m128i THIRTYSEVEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(23385); // round(37/(4*sqrt(42))*2^14), Q2.14
  __m128i TWENTYFIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(31601); // round(25/(4*sqrt(42))*2^15)
  __m128i TWENTYNINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(18329); // round(29/(4*sqrt(42))*2^15), Q2.14
  __m128i SEVENTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(21489); // round(17/(4*sqrt(42))*2^15)
  __m128i NINE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(11376); // round(9/(4*sqrt(42))*2^15)
  __m128i THIRTEEN_OVER_FOUR_SQRT_42 = _mm_set1_epi16(16433); // round(13/(4*sqrt(42))*2^15)
  __m128i FIVE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(6320); // round(5/(4*sqrt(42))*2^15)
  __m128i ONE_OVER_FOUR_SQRT_42 = _mm_set1_epi16(1264); // round(1/(4*sqrt(42))*2^15)
  __m128i SQRT_42_OVER_FOUR = _mm_set1_epi16(13272); // round(sqrt(42)/4*2^13), Q3.12

  __m128i ch_mag_des;
  __m128i ch_mag_int;
  __m128i ch_mag_98_over_42_with_sigma2;
  __m128i ch_mag_74_over_42_with_sigma2;
  __m128i ch_mag_58_over_42_with_sigma2;
  __m128i ch_mag_50_over_42_with_sigma2;
  __m128i ch_mag_34_over_42_with_sigma2;
  __m128i ch_mag_18_over_42_with_sigma2;
  __m128i ch_mag_26_over_42_with_sigma2;
  __m128i ch_mag_10_over_42_with_sigma2;
  __m128i ch_mag_2_over_42_with_sigma2;
  __m128i  y0r_one_over_sqrt_21;
  __m128i  y0r_three_over_sqrt_21;
  __m128i  y0r_five_over_sqrt_21;
  __m128i  y0r_seven_over_sqrt_21;
  __m128i  y0i_one_over_sqrt_21;
  __m128i  y0i_three_over_sqrt_21;
  __m128i  y0i_five_over_sqrt_21;
  __m128i  y0i_seven_over_sqrt_21;
  __m128i ch_mag_int_with_sigma2;
  __m128i two_ch_mag_int_with_sigma2;
  __m128i three_ch_mag_int_with_sigma2;
#elif defined(__arm__)

#endif

  int i,j;


  for (i=0; i<length>>2; i+=2) {

#if defined(__x86_64__) || defined(__i386__)

    // Get rho
    xmm0 = rho01_128i[i];
    xmm1 = rho01_128i[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    xmm2 = _mm_unpacklo_epi64(xmm0,xmm1); // Re(rho)
    xmm3 = _mm_unpackhi_epi64(xmm0,xmm1); // Im(rho)
    rho_rpi = _mm_adds_epi16(xmm2,xmm3); // rho = Re(rho) + Im(rho)
    rho_rmi = _mm_subs_epi16(xmm2,xmm3); // rho* = Re(rho) - Im(rho)

    // Compute the different rhos
    rho_rpi_1_1 = _mm_mulhi_epi16(rho_rpi, ONE_OVER_SQRT_42);
    rho_rmi_1_1 = _mm_mulhi_epi16(rho_rmi, ONE_OVER_SQRT_42);
    rho_rpi_3_3 = _mm_mulhi_epi16(rho_rpi, THREE_OVER_SQRT_42);
    rho_rmi_3_3 = _mm_mulhi_epi16(rho_rmi, THREE_OVER_SQRT_42);
    rho_rpi_5_5 = _mm_mulhi_epi16(rho_rpi, FIVE_OVER_SQRT_42);
    rho_rmi_5_5 = _mm_mulhi_epi16(rho_rmi, FIVE_OVER_SQRT_42);
    rho_rpi_7_7 = _mm_mulhi_epi16(rho_rpi, SEVEN_OVER_SQRT_42);
    rho_rmi_7_7 = _mm_mulhi_epi16(rho_rmi, SEVEN_OVER_SQRT_42);

    rho_rpi_5_5 = _mm_slli_epi16(rho_rpi_5_5, 1);
    rho_rmi_5_5 = _mm_slli_epi16(rho_rmi_5_5, 1);
    rho_rpi_7_7 = _mm_slli_epi16(rho_rpi_7_7, 2);
    rho_rmi_7_7 = _mm_slli_epi16(rho_rmi_7_7, 2);

    xmm4 = _mm_mulhi_epi16(xmm2, ONE_OVER_SQRT_42);
    xmm5 = _mm_mulhi_epi16(xmm3, ONE_OVER_SQRT_42);
    xmm6 = _mm_mulhi_epi16(xmm3, THREE_OVER_SQRT_42);
    xmm7 = _mm_mulhi_epi16(xmm3, FIVE_OVER_SQRT_42);
    xmm8 = _mm_mulhi_epi16(xmm3, SEVEN_OVER_SQRT_42);
    xmm7 = _mm_slli_epi16(xmm7, 1);
    xmm8 = _mm_slli_epi16(xmm8, 2);

    rho_rpi_1_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_1_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_1_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_1_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_1_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_1_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, THREE_OVER_SQRT_42);
    rho_rpi_3_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_3_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_3_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_3_5 = _mm_subs_epi16(xmm4, xmm7);
    rho_rpi_3_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_3_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, FIVE_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 1);
    rho_rpi_5_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_5_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_5_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_5_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_5_7 = _mm_adds_epi16(xmm4, xmm8);
    rho_rmi_5_7 = _mm_subs_epi16(xmm4, xmm8);

    xmm4 = _mm_mulhi_epi16(xmm2, SEVEN_OVER_SQRT_42);
    xmm4 = _mm_slli_epi16(xmm4, 2);
    rho_rpi_7_1 = _mm_adds_epi16(xmm4, xmm5);
    rho_rmi_7_1 = _mm_subs_epi16(xmm4, xmm5);
    rho_rpi_7_3 = _mm_adds_epi16(xmm4, xmm6);
    rho_rmi_7_3 = _mm_subs_epi16(xmm4, xmm6);
    rho_rpi_7_5 = _mm_adds_epi16(xmm4, xmm7);
    rho_rmi_7_5 = _mm_subs_epi16(xmm4, xmm7);

    // Rearrange interfering MF output
    xmm0 = stream1_128i_in[i];
    xmm1 = stream1_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y1r = _mm_unpacklo_epi64(xmm0,xmm1); //[y1r(1),y1r(2),y1r(3),y1r(4)]
    y1i = _mm_unpackhi_epi64(xmm0,xmm1); //[y1i(1),y1i(2),y1i(3),y1i(4)]

    // Psi_r calculation from rho_rpi or rho_rmi
    xmm0 = _mm_setzero_si128(); // ZERO for abs_pi16
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1r);
    psi_r_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1r);
    psi_r_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1r);
    psi_r_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1r);
    psi_r_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1r);
    psi_r_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1r);
    psi_r_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1r);
    psi_r_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1r);
    psi_r_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1r);
    psi_r_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1r);
    psi_r_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1r);
    psi_r_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1r);
    psi_r_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1r);
    psi_r_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1r);
    psi_r_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1r);
    psi_r_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1r);
    psi_r_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1r);
    psi_r_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1r);
    psi_r_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1r);
    psi_r_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1r);
    psi_r_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1r);
    psi_r_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1r);
    psi_r_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1r);
    psi_r_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1r);
    psi_r_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1r);
    psi_r_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1r);
    psi_r_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1r);
    psi_r_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1r);
    psi_r_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1r);
    psi_r_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1r);
    psi_r_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1r);
    psi_r_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1r);
    psi_r_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1r);
    psi_r_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1r);
    psi_r_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1r);
    psi_r_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1r);
    psi_r_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1r);
    psi_r_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1r);
    psi_r_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1r);
    psi_r_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1r);
    psi_r_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1r);
    psi_r_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1r);
    psi_r_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1r);
    psi_r_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1r);
    psi_r_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1r);
    psi_r_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1r);
    psi_r_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1r);
    psi_r_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1r);
    psi_r_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1r);
    psi_r_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1r);
    psi_r_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1r);
    psi_r_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1r);
    psi_r_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1r);
    psi_r_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1r);
    psi_r_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1r);
    psi_r_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1r);
    psi_r_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1r);
    psi_r_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1r);
    psi_r_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1r);
    psi_r_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1r);
    psi_r_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1r);
    psi_r_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1r);
    psi_r_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1r);
    psi_r_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1r);
    psi_r_m7_m7 = _mm_abs_epi16(xmm2);

    // Psi_i calculation from rho_rpi or rho_rmi
    xmm2 = _mm_subs_epi16(rho_rmi_7_7, y1i);
    psi_i_p7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_7, y1i);
    psi_i_p7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_7, y1i);
    psi_i_p7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_7, y1i);
    psi_i_p7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_7, y1i);
    psi_i_p7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_7, y1i);
    psi_i_p7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_7, y1i);
    psi_i_p7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_7, y1i);
    psi_i_p7_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_5, y1i);
    psi_i_p5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_5, y1i);
    psi_i_p5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_5, y1i);
    psi_i_p5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_5, y1i);
    psi_i_p5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_5, y1i);
    psi_i_p5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_5, y1i);
    psi_i_p5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_5, y1i);
    psi_i_p5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_5, y1i);
    psi_i_p5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_3, y1i);
    psi_i_p3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_3, y1i);
    psi_i_p3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_3, y1i);
    psi_i_p3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_3, y1i);
    psi_i_p3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_3, y1i);
    psi_i_p3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_3, y1i);
    psi_i_p3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_3, y1i);
    psi_i_p3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_3, y1i);
    psi_i_p3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_7_1, y1i);
    psi_i_p1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_5_1, y1i);
    psi_i_p1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_3_1, y1i);
    psi_i_p1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rmi_1_1, y1i);
    psi_i_p1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_1_1, y1i);
    psi_i_p1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_3_1, y1i);
    psi_i_p1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_5_1, y1i);
    psi_i_p1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rpi_7_1, y1i);
    psi_i_p1_m7 = _mm_abs_epi16(xmm2);

    xmm2 = _mm_subs_epi16(rho_rpi_7_1, y1i);
    psi_i_m1_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_1, y1i);
    psi_i_m1_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_1, y1i);
    psi_i_m1_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_1, y1i);
    psi_i_m1_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_1, y1i);
    psi_i_m1_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_1, y1i);
    psi_i_m1_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_1, y1i);
    psi_i_m1_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_1, y1i);
    psi_i_m1_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_3, y1i);
    psi_i_m3_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_3, y1i);
    psi_i_m3_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_3, y1i);
    psi_i_m3_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_3, y1i);
    psi_i_m3_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_3, y1i);
    psi_i_m3_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_3, y1i);
    psi_i_m3_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_3, y1i);
    psi_i_m3_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_3, y1i);
    psi_i_m3_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_5, y1i);
    psi_i_m5_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_5, y1i);
    psi_i_m5_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_5, y1i);
    psi_i_m5_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_5, y1i);
    psi_i_m5_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_5, y1i);
    psi_i_m5_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_5, y1i);
    psi_i_m5_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_5, y1i);
    psi_i_m5_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_5, y1i);
    psi_i_m5_m7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_7_7, y1i);
    psi_i_m7_p7 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_5_7, y1i);
    psi_i_m7_p5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_3_7, y1i);
    psi_i_m7_p3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_subs_epi16(rho_rpi_1_7, y1i);
    psi_i_m7_p1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_1_7, y1i);
    psi_i_m7_m1 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_3_7, y1i);
    psi_i_m7_m3 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_5_7, y1i);
    psi_i_m7_m5 = _mm_abs_epi16(xmm2);
    xmm2 = _mm_adds_epi16(rho_rmi_7_7, y1i);
    psi_i_m7_m7 = _mm_abs_epi16(xmm2);


    // Rearrange desired MF output
    xmm0 = stream0_128i_in[i];
    xmm1 = stream0_128i_in[i+1];
    xmm0 = _mm_shufflelo_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shufflehi_epi16(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm0 = _mm_shuffle_epi32(xmm0,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflelo_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shufflehi_epi16(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm1 = _mm_shuffle_epi32(xmm1,0xd8); //_MM_SHUFFLE(0,2,1,3));
    //xmm0 = [Re(0,1) Re(2,3) Im(0,1) Im(2,3)]
    //xmm1 = [Re(4,5) Re(6,7) Im(4,5) Im(6,7)]
    y0r = _mm_unpacklo_epi64(xmm0,xmm1); // = [y0r(1),y0r(2),y0r(3),y0r(4)]
    y0i = _mm_unpackhi_epi64(xmm0,xmm1);

    // Rearrange desired channel magnitudes
    xmm2 = ch_mag_128i[i]; // = [|h|^2(1),|h|^2(1),|h|^2(2),|h|^2(2)]*(2/sqrt(10))
    xmm3 = ch_mag_128i[i+1]; // = [|h|^2(3),|h|^2(3),|h|^2(4),|h|^2(4)]*(2/sqrt(10))
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_des = _mm_unpacklo_epi64(xmm2,xmm3);

    // Rearrange interfering channel magnitudes
    xmm2 = ch_mag_128i_i[i];
    xmm3 = ch_mag_128i_i[i+1];
    xmm2 = _mm_shufflelo_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shufflehi_epi16(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm2 = _mm_shuffle_epi32(xmm2,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflelo_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shufflehi_epi16(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    xmm3 = _mm_shuffle_epi32(xmm3,0xd8); //_MM_SHUFFLE(0,2,1,3));
    ch_mag_int  = _mm_unpacklo_epi64(xmm2,xmm3);

    y0r_one_over_sqrt_21   = _mm_mulhi_epi16(y0r, ONE_OVER_SQRT_42);
    y0r_three_over_sqrt_21 = _mm_mulhi_epi16(y0r, THREE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_mulhi_epi16(y0r, FIVE_OVER_SQRT_42);
    y0r_five_over_sqrt_21  = _mm_slli_epi16(y0r_five_over_sqrt_21, 1);
    y0r_seven_over_sqrt_21 = _mm_mulhi_epi16(y0r, SEVEN_OVER_SQRT_42);
    y0r_seven_over_sqrt_21 = _mm_slli_epi16(y0r_seven_over_sqrt_21, 2); // Q2.14

    y0i_one_over_sqrt_21   = _mm_mulhi_epi16(y0i, ONE_OVER_SQRT_42);
    y0i_three_over_sqrt_21 = _mm_mulhi_epi16(y0i, THREE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_mulhi_epi16(y0i, FIVE_OVER_SQRT_42);
    y0i_five_over_sqrt_21  = _mm_slli_epi16(y0i_five_over_sqrt_21, 1);
    y0i_seven_over_sqrt_21 = _mm_mulhi_epi16(y0i, SEVEN_OVER_SQRT_42);
    y0i_seven_over_sqrt_21 = _mm_slli_epi16(y0i_seven_over_sqrt_21, 2); // Q2.14

    y0_p_7_1 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_7_3 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_7_5 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_7_7 = _mm_adds_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_5_1 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_5_3 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_5_5 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_5_7 = _mm_adds_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_3_1 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_3_3 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_3_5 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_3_7 = _mm_adds_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_p_1_1 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_p_1_3 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_p_1_5 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_p_1_7 = _mm_adds_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);

    y0_m_1_1 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_1_3 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_1_5 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_1_7 = _mm_subs_epi16(y0r_one_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_3_1 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_3_3 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_3_5 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_3_7 = _mm_subs_epi16(y0r_three_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_5_1 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_5_3 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_5_5 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_5_7 = _mm_subs_epi16(y0r_five_over_sqrt_21, y0i_seven_over_sqrt_21);
    y0_m_7_1 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_one_over_sqrt_21);
    y0_m_7_3 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_three_over_sqrt_21);
    y0_m_7_5 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_five_over_sqrt_21);
    y0_m_7_7 = _mm_subs_epi16(y0r_seven_over_sqrt_21, y0i_seven_over_sqrt_21);

    // Detection of interference term
    ch_mag_int_with_sigma2       = _mm_srai_epi16(ch_mag_int, 1); // *2
    two_ch_mag_int_with_sigma2   = ch_mag_int; // *4
    three_ch_mag_int_with_sigma2 = _mm_adds_epi16(ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2); // *6

    interference_abs_64qam_epi16(psi_r_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_r_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_r_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    interference_abs_64qam_epi16(psi_i_p7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_p1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_p1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m1_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m1_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m3_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m3_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m5_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m5_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_p1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_p1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m1, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m1, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m3, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m3, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m5, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m5, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);
    interference_abs_64qam_epi16(psi_i_m7_m7, ch_mag_int_with_sigma2, two_ch_mag_int_with_sigma2, three_ch_mag_int_with_sigma2, a_i_m7_m7, ONE_OVER_SQRT_2_42, THREE_OVER_SQRT_2_42, FIVE_OVER_SQRT_2_42,
                                 SEVEN_OVER_SQRT_2_42);

    // Calculation of a group of two terms in the bit metric involving product of psi and interference
    prodsum_psi_a_epi16(psi_r_p7_p7, a_r_p7_p7, psi_i_p7_p7, a_i_p7_p7, psi_a_p7_p7);
    prodsum_psi_a_epi16(psi_r_p7_p5, a_r_p7_p5, psi_i_p7_p5, a_i_p7_p5, psi_a_p7_p5);
    prodsum_psi_a_epi16(psi_r_p7_p3, a_r_p7_p3, psi_i_p7_p3, a_i_p7_p3, psi_a_p7_p3);
    prodsum_psi_a_epi16(psi_r_p7_p1, a_r_p7_p1, psi_i_p7_p1, a_i_p7_p1, psi_a_p7_p1);
    prodsum_psi_a_epi16(psi_r_p7_m1, a_r_p7_m1, psi_i_p7_m1, a_i_p7_m1, psi_a_p7_m1);
    prodsum_psi_a_epi16(psi_r_p7_m3, a_r_p7_m3, psi_i_p7_m3, a_i_p7_m3, psi_a_p7_m3);
    prodsum_psi_a_epi16(psi_r_p7_m5, a_r_p7_m5, psi_i_p7_m5, a_i_p7_m5, psi_a_p7_m5);
    prodsum_psi_a_epi16(psi_r_p7_m7, a_r_p7_m7, psi_i_p7_m7, a_i_p7_m7, psi_a_p7_m7);
    prodsum_psi_a_epi16(psi_r_p5_p7, a_r_p5_p7, psi_i_p5_p7, a_i_p5_p7, psi_a_p5_p7);
    prodsum_psi_a_epi16(psi_r_p5_p5, a_r_p5_p5, psi_i_p5_p5, a_i_p5_p5, psi_a_p5_p5);
    prodsum_psi_a_epi16(psi_r_p5_p3, a_r_p5_p3, psi_i_p5_p3, a_i_p5_p3, psi_a_p5_p3);
    prodsum_psi_a_epi16(psi_r_p5_p1, a_r_p5_p1, psi_i_p5_p1, a_i_p5_p1, psi_a_p5_p1);
    prodsum_psi_a_epi16(psi_r_p5_m1, a_r_p5_m1, psi_i_p5_m1, a_i_p5_m1, psi_a_p5_m1);
    prodsum_psi_a_epi16(psi_r_p5_m3, a_r_p5_m3, psi_i_p5_m3, a_i_p5_m3, psi_a_p5_m3);
    prodsum_psi_a_epi16(psi_r_p5_m5, a_r_p5_m5, psi_i_p5_m5, a_i_p5_m5, psi_a_p5_m5);
    prodsum_psi_a_epi16(psi_r_p5_m7, a_r_p5_m7, psi_i_p5_m7, a_i_p5_m7, psi_a_p5_m7);
    prodsum_psi_a_epi16(psi_r_p3_p7, a_r_p3_p7, psi_i_p3_p7, a_i_p3_p7, psi_a_p3_p7);
    prodsum_psi_a_epi16(psi_r_p3_p5, a_r_p3_p5, psi_i_p3_p5, a_i_p3_p5, psi_a_p3_p5);
    prodsum_psi_a_epi16(psi_r_p3_p3, a_r_p3_p3, psi_i_p3_p3, a_i_p3_p3, psi_a_p3_p3);
    prodsum_psi_a_epi16(psi_r_p3_p1, a_r_p3_p1, psi_i_p3_p1, a_i_p3_p1, psi_a_p3_p1);
    prodsum_psi_a_epi16(psi_r_p3_m1, a_r_p3_m1, psi_i_p3_m1, a_i_p3_m1, psi_a_p3_m1);
    prodsum_psi_a_epi16(psi_r_p3_m3, a_r_p3_m3, psi_i_p3_m3, a_i_p3_m3, psi_a_p3_m3);
    prodsum_psi_a_epi16(psi_r_p3_m5, a_r_p3_m5, psi_i_p3_m5, a_i_p3_m5, psi_a_p3_m5);
    prodsum_psi_a_epi16(psi_r_p3_m7, a_r_p3_m7, psi_i_p3_m7, a_i_p3_m7, psi_a_p3_m7);
    prodsum_psi_a_epi16(psi_r_p1_p7, a_r_p1_p7, psi_i_p1_p7, a_i_p1_p7, psi_a_p1_p7);
    prodsum_psi_a_epi16(psi_r_p1_p5, a_r_p1_p5, psi_i_p1_p5, a_i_p1_p5, psi_a_p1_p5);
    prodsum_psi_a_epi16(psi_r_p1_p3, a_r_p1_p3, psi_i_p1_p3, a_i_p1_p3, psi_a_p1_p3);
    prodsum_psi_a_epi16(psi_r_p1_p1, a_r_p1_p1, psi_i_p1_p1, a_i_p1_p1, psi_a_p1_p1);
    prodsum_psi_a_epi16(psi_r_p1_m1, a_r_p1_m1, psi_i_p1_m1, a_i_p1_m1, psi_a_p1_m1);
    prodsum_psi_a_epi16(psi_r_p1_m3, a_r_p1_m3, psi_i_p1_m3, a_i_p1_m3, psi_a_p1_m3);
    prodsum_psi_a_epi16(psi_r_p1_m5, a_r_p1_m5, psi_i_p1_m5, a_i_p1_m5, psi_a_p1_m5);
    prodsum_psi_a_epi16(psi_r_p1_m7, a_r_p1_m7, psi_i_p1_m7, a_i_p1_m7, psi_a_p1_m7);
    prodsum_psi_a_epi16(psi_r_m1_p7, a_r_m1_p7, psi_i_m1_p7, a_i_m1_p7, psi_a_m1_p7);
    prodsum_psi_a_epi16(psi_r_m1_p5, a_r_m1_p5, psi_i_m1_p5, a_i_m1_p5, psi_a_m1_p5);
    prodsum_psi_a_epi16(psi_r_m1_p3, a_r_m1_p3, psi_i_m1_p3, a_i_m1_p3, psi_a_m1_p3);
    prodsum_psi_a_epi16(psi_r_m1_p1, a_r_m1_p1, psi_i_m1_p1, a_i_m1_p1, psi_a_m1_p1);
    prodsum_psi_a_epi16(psi_r_m1_m1, a_r_m1_m1, psi_i_m1_m1, a_i_m1_m1, psi_a_m1_m1);
    prodsum_psi_a_epi16(psi_r_m1_m3, a_r_m1_m3, psi_i_m1_m3, a_i_m1_m3, psi_a_m1_m3);
    prodsum_psi_a_epi16(psi_r_m1_m5, a_r_m1_m5, psi_i_m1_m5, a_i_m1_m5, psi_a_m1_m5);
    prodsum_psi_a_epi16(psi_r_m1_m7, a_r_m1_m7, psi_i_m1_m7, a_i_m1_m7, psi_a_m1_m7);
    prodsum_psi_a_epi16(psi_r_m3_p7, a_r_m3_p7, psi_i_m3_p7, a_i_m3_p7, psi_a_m3_p7);
    prodsum_psi_a_epi16(psi_r_m3_p5, a_r_m3_p5, psi_i_m3_p5, a_i_m3_p5, psi_a_m3_p5);
    prodsum_psi_a_epi16(psi_r_m3_p3, a_r_m3_p3, psi_i_m3_p3, a_i_m3_p3, psi_a_m3_p3);
    prodsum_psi_a_epi16(psi_r_m3_p1, a_r_m3_p1, psi_i_m3_p1, a_i_m3_p1, psi_a_m3_p1);
    prodsum_psi_a_epi16(psi_r_m3_m1, a_r_m3_m1, psi_i_m3_m1, a_i_m3_m1, psi_a_m3_m1);
    prodsum_psi_a_epi16(psi_r_m3_m3, a_r_m3_m3, psi_i_m3_m3, a_i_m3_m3, psi_a_m3_m3);
    prodsum_psi_a_epi16(psi_r_m3_m5, a_r_m3_m5, psi_i_m3_m5, a_i_m3_m5, psi_a_m3_m5);
    prodsum_psi_a_epi16(psi_r_m3_m7, a_r_m3_m7, psi_i_m3_m7, a_i_m3_m7, psi_a_m3_m7);
    prodsum_psi_a_epi16(psi_r_m5_p7, a_r_m5_p7, psi_i_m5_p7, a_i_m5_p7, psi_a_m5_p7);
    prodsum_psi_a_epi16(psi_r_m5_p5, a_r_m5_p5, psi_i_m5_p5, a_i_m5_p5, psi_a_m5_p5);
    prodsum_psi_a_epi16(psi_r_m5_p3, a_r_m5_p3, psi_i_m5_p3, a_i_m5_p3, psi_a_m5_p3);
    prodsum_psi_a_epi16(psi_r_m5_p1, a_r_m5_p1, psi_i_m5_p1, a_i_m5_p1, psi_a_m5_p1);
    prodsum_psi_a_epi16(psi_r_m5_m1, a_r_m5_m1, psi_i_m5_m1, a_i_m5_m1, psi_a_m5_m1);
    prodsum_psi_a_epi16(psi_r_m5_m3, a_r_m5_m3, psi_i_m5_m3, a_i_m5_m3, psi_a_m5_m3);
    prodsum_psi_a_epi16(psi_r_m5_m5, a_r_m5_m5, psi_i_m5_m5, a_i_m5_m5, psi_a_m5_m5);
    prodsum_psi_a_epi16(psi_r_m5_m7, a_r_m5_m7, psi_i_m5_m7, a_i_m5_m7, psi_a_m5_m7);
    prodsum_psi_a_epi16(psi_r_m7_p7, a_r_m7_p7, psi_i_m7_p7, a_i_m7_p7, psi_a_m7_p7);
    prodsum_psi_a_epi16(psi_r_m7_p5, a_r_m7_p5, psi_i_m7_p5, a_i_m7_p5, psi_a_m7_p5);
    prodsum_psi_a_epi16(psi_r_m7_p3, a_r_m7_p3, psi_i_m7_p3, a_i_m7_p3, psi_a_m7_p3);
    prodsum_psi_a_epi16(psi_r_m7_p1, a_r_m7_p1, psi_i_m7_p1, a_i_m7_p1, psi_a_m7_p1);
    prodsum_psi_a_epi16(psi_r_m7_m1, a_r_m7_m1, psi_i_m7_m1, a_i_m7_m1, psi_a_m7_m1);
    prodsum_psi_a_epi16(psi_r_m7_m3, a_r_m7_m3, psi_i_m7_m3, a_i_m7_m3, psi_a_m7_m3);
    prodsum_psi_a_epi16(psi_r_m7_m5, a_r_m7_m5, psi_i_m7_m5, a_i_m7_m5, psi_a_m7_m5);
    prodsum_psi_a_epi16(psi_r_m7_m7, a_r_m7_m7, psi_i_m7_m7, a_i_m7_m7, psi_a_m7_m7);

    // Multiply by sqrt(2)
    psi_a_p7_p7 = _mm_mulhi_epi16(psi_a_p7_p7, ONE_OVER_SQRT_2);
    psi_a_p7_p7 = _mm_slli_epi16(psi_a_p7_p7, 2);
    psi_a_p7_p5 = _mm_mulhi_epi16(psi_a_p7_p5, ONE_OVER_SQRT_2);
    psi_a_p7_p5 = _mm_slli_epi16(psi_a_p7_p5, 2);
    psi_a_p7_p3 = _mm_mulhi_epi16(psi_a_p7_p3, ONE_OVER_SQRT_2);
    psi_a_p7_p3 = _mm_slli_epi16(psi_a_p7_p3, 2);
    psi_a_p7_p1 = _mm_mulhi_epi16(psi_a_p7_p1, ONE_OVER_SQRT_2);
    psi_a_p7_p1 = _mm_slli_epi16(psi_a_p7_p1, 2);
    psi_a_p7_m1 = _mm_mulhi_epi16(psi_a_p7_m1, ONE_OVER_SQRT_2);
    psi_a_p7_m1 = _mm_slli_epi16(psi_a_p7_m1, 2);
    psi_a_p7_m3 = _mm_mulhi_epi16(psi_a_p7_m3, ONE_OVER_SQRT_2);
    psi_a_p7_m3 = _mm_slli_epi16(psi_a_p7_m3, 2);
    psi_a_p7_m5 = _mm_mulhi_epi16(psi_a_p7_m5, ONE_OVER_SQRT_2);
    psi_a_p7_m5 = _mm_slli_epi16(psi_a_p7_m5, 2);
    psi_a_p7_m7 = _mm_mulhi_epi16(psi_a_p7_m7, ONE_OVER_SQRT_2);
    psi_a_p7_m7 = _mm_slli_epi16(psi_a_p7_m7, 2);
    psi_a_p5_p7 = _mm_mulhi_epi16(psi_a_p5_p7, ONE_OVER_SQRT_2);
    psi_a_p5_p7 = _mm_slli_epi16(psi_a_p5_p7, 2);
    psi_a_p5_p5 = _mm_mulhi_epi16(psi_a_p5_p5, ONE_OVER_SQRT_2);
    psi_a_p5_p5 = _mm_slli_epi16(psi_a_p5_p5, 2);
    psi_a_p5_p3 = _mm_mulhi_epi16(psi_a_p5_p3, ONE_OVER_SQRT_2);
    psi_a_p5_p3 = _mm_slli_epi16(psi_a_p5_p3, 2);
    psi_a_p5_p1 = _mm_mulhi_epi16(psi_a_p5_p1, ONE_OVER_SQRT_2);
    psi_a_p5_p1 = _mm_slli_epi16(psi_a_p5_p1, 2);
    psi_a_p5_m1 = _mm_mulhi_epi16(psi_a_p5_m1, ONE_OVER_SQRT_2);
    psi_a_p5_m1 = _mm_slli_epi16(psi_a_p5_m1, 2);
    psi_a_p5_m3 = _mm_mulhi_epi16(psi_a_p5_m3, ONE_OVER_SQRT_2);
    psi_a_p5_m3 = _mm_slli_epi16(psi_a_p5_m3, 2);
    psi_a_p5_m5 = _mm_mulhi_epi16(psi_a_p5_m5, ONE_OVER_SQRT_2);
    psi_a_p5_m5 = _mm_slli_epi16(psi_a_p5_m5, 2);
    psi_a_p5_m7 = _mm_mulhi_epi16(psi_a_p5_m7, ONE_OVER_SQRT_2);
    psi_a_p5_m7 = _mm_slli_epi16(psi_a_p5_m7, 2);
    psi_a_p3_p7 = _mm_mulhi_epi16(psi_a_p3_p7, ONE_OVER_SQRT_2);
    psi_a_p3_p7 = _mm_slli_epi16(psi_a_p3_p7, 2);
    psi_a_p3_p5 = _mm_mulhi_epi16(psi_a_p3_p5, ONE_OVER_SQRT_2);
    psi_a_p3_p5 = _mm_slli_epi16(psi_a_p3_p5, 2);
    psi_a_p3_p3 = _mm_mulhi_epi16(psi_a_p3_p3, ONE_OVER_SQRT_2);
    psi_a_p3_p3 = _mm_slli_epi16(psi_a_p3_p3, 2);
    psi_a_p3_p1 = _mm_mulhi_epi16(psi_a_p3_p1, ONE_OVER_SQRT_2);
    psi_a_p3_p1 = _mm_slli_epi16(psi_a_p3_p1, 2);
    psi_a_p3_m1 = _mm_mulhi_epi16(psi_a_p3_m1, ONE_OVER_SQRT_2);
    psi_a_p3_m1 = _mm_slli_epi16(psi_a_p3_m1, 2);
    psi_a_p3_m3 = _mm_mulhi_epi16(psi_a_p3_m3, ONE_OVER_SQRT_2);
    psi_a_p3_m3 = _mm_slli_epi16(psi_a_p3_m3, 2);
    psi_a_p3_m5 = _mm_mulhi_epi16(psi_a_p3_m5, ONE_OVER_SQRT_2);
    psi_a_p3_m5 = _mm_slli_epi16(psi_a_p3_m5, 2);
    psi_a_p3_m7 = _mm_mulhi_epi16(psi_a_p3_m7, ONE_OVER_SQRT_2);
    psi_a_p3_m7 = _mm_slli_epi16(psi_a_p3_m7, 2);
    psi_a_p1_p7 = _mm_mulhi_epi16(psi_a_p1_p7, ONE_OVER_SQRT_2);
    psi_a_p1_p7 = _mm_slli_epi16(psi_a_p1_p7, 2);
    psi_a_p1_p5 = _mm_mulhi_epi16(psi_a_p1_p5, ONE_OVER_SQRT_2);
    psi_a_p1_p5 = _mm_slli_epi16(psi_a_p1_p5, 2);
    psi_a_p1_p3 = _mm_mulhi_epi16(psi_a_p1_p3, ONE_OVER_SQRT_2);
    psi_a_p1_p3 = _mm_slli_epi16(psi_a_p1_p3, 2);
    psi_a_p1_p1 = _mm_mulhi_epi16(psi_a_p1_p1, ONE_OVER_SQRT_2);
    psi_a_p1_p1 = _mm_slli_epi16(psi_a_p1_p1, 2);
    psi_a_p1_m1 = _mm_mulhi_epi16(psi_a_p1_m1, ONE_OVER_SQRT_2);
    psi_a_p1_m1 = _mm_slli_epi16(psi_a_p1_m1, 2);
    psi_a_p1_m3 = _mm_mulhi_epi16(psi_a_p1_m3, ONE_OVER_SQRT_2);
    psi_a_p1_m3 = _mm_slli_epi16(psi_a_p1_m3, 2);
    psi_a_p1_m5 = _mm_mulhi_epi16(psi_a_p1_m5, ONE_OVER_SQRT_2);
    psi_a_p1_m5 = _mm_slli_epi16(psi_a_p1_m5, 2);
    psi_a_p1_m7 = _mm_mulhi_epi16(psi_a_p1_m7, ONE_OVER_SQRT_2);
    psi_a_p1_m7 = _mm_slli_epi16(psi_a_p1_m7, 2);
    psi_a_m1_p7 = _mm_mulhi_epi16(psi_a_m1_p7, ONE_OVER_SQRT_2);
    psi_a_m1_p7 = _mm_slli_epi16(psi_a_m1_p7, 2);
    psi_a_m1_p5 = _mm_mulhi_epi16(psi_a_m1_p5, ONE_OVER_SQRT_2);
    psi_a_m1_p5 = _mm_slli_epi16(psi_a_m1_p5, 2);
    psi_a_m1_p3 = _mm_mulhi_epi16(psi_a_m1_p3, ONE_OVER_SQRT_2);
    psi_a_m1_p3 = _mm_slli_epi16(psi_a_m1_p3, 2);
    psi_a_m1_p1 = _mm_mulhi_epi16(psi_a_m1_p1, ONE_OVER_SQRT_2);
    psi_a_m1_p1 = _mm_slli_epi16(psi_a_m1_p1, 2);
    psi_a_m1_m1 = _mm_mulhi_epi16(psi_a_m1_m1, ONE_OVER_SQRT_2);
    psi_a_m1_m1 = _mm_slli_epi16(psi_a_m1_m1, 2);
    psi_a_m1_m3 = _mm_mulhi_epi16(psi_a_m1_m3, ONE_OVER_SQRT_2);
    psi_a_m1_m3 = _mm_slli_epi16(psi_a_m1_m3, 2);
    psi_a_m1_m5 = _mm_mulhi_epi16(psi_a_m1_m5, ONE_OVER_SQRT_2);
    psi_a_m1_m5 = _mm_slli_epi16(psi_a_m1_m5, 2);
    psi_a_m1_m7 = _mm_mulhi_epi16(psi_a_m1_m7, ONE_OVER_SQRT_2);
    psi_a_m1_m7 = _mm_slli_epi16(psi_a_m1_m7, 2);
    psi_a_m3_p7 = _mm_mulhi_epi16(psi_a_m3_p7, ONE_OVER_SQRT_2);
    psi_a_m3_p7 = _mm_slli_epi16(psi_a_m3_p7, 2);
    psi_a_m3_p5 = _mm_mulhi_epi16(psi_a_m3_p5, ONE_OVER_SQRT_2);
    psi_a_m3_p5 = _mm_slli_epi16(psi_a_m3_p5, 2);
    psi_a_m3_p3 = _mm_mulhi_epi16(psi_a_m3_p3, ONE_OVER_SQRT_2);
    psi_a_m3_p3 = _mm_slli_epi16(psi_a_m3_p3, 2);
    psi_a_m3_p1 = _mm_mulhi_epi16(psi_a_m3_p1, ONE_OVER_SQRT_2);
    psi_a_m3_p1 = _mm_slli_epi16(psi_a_m3_p1, 2);
    psi_a_m3_m1 = _mm_mulhi_epi16(psi_a_m3_m1, ONE_OVER_SQRT_2);
    psi_a_m3_m1 = _mm_slli_epi16(psi_a_m3_m1, 2);
    psi_a_m3_m3 = _mm_mulhi_epi16(psi_a_m3_m3, ONE_OVER_SQRT_2);
    psi_a_m3_m3 = _mm_slli_epi16(psi_a_m3_m3, 2);
    psi_a_m3_m5 = _mm_mulhi_epi16(psi_a_m3_m5, ONE_OVER_SQRT_2);
    psi_a_m3_m5 = _mm_slli_epi16(psi_a_m3_m5, 2);
    psi_a_m3_m7 = _mm_mulhi_epi16(psi_a_m3_m7, ONE_OVER_SQRT_2);
    psi_a_m3_m7 = _mm_slli_epi16(psi_a_m3_m7, 2);
    psi_a_m5_p7 = _mm_mulhi_epi16(psi_a_m5_p7, ONE_OVER_SQRT_2);
    psi_a_m5_p7 = _mm_slli_epi16(psi_a_m5_p7, 2);
    psi_a_m5_p5 = _mm_mulhi_epi16(psi_a_m5_p5, ONE_OVER_SQRT_2);
    psi_a_m5_p5 = _mm_slli_epi16(psi_a_m5_p5, 2);
    psi_a_m5_p3 = _mm_mulhi_epi16(psi_a_m5_p3, ONE_OVER_SQRT_2);
    psi_a_m5_p3 = _mm_slli_epi16(psi_a_m5_p3, 2);
    psi_a_m5_p1 = _mm_mulhi_epi16(psi_a_m5_p1, ONE_OVER_SQRT_2);
    psi_a_m5_p1 = _mm_slli_epi16(psi_a_m5_p1, 2);
    psi_a_m5_m1 = _mm_mulhi_epi16(psi_a_m5_m1, ONE_OVER_SQRT_2);
    psi_a_m5_m1 = _mm_slli_epi16(psi_a_m5_m1, 2);
    psi_a_m5_m3 = _mm_mulhi_epi16(psi_a_m5_m3, ONE_OVER_SQRT_2);
    psi_a_m5_m3 = _mm_slli_epi16(psi_a_m5_m3, 2);
    psi_a_m5_m5 = _mm_mulhi_epi16(psi_a_m5_m5, ONE_OVER_SQRT_2);
    psi_a_m5_m5 = _mm_slli_epi16(psi_a_m5_m5, 2);
    psi_a_m5_m7 = _mm_mulhi_epi16(psi_a_m5_m7, ONE_OVER_SQRT_2);
    psi_a_m5_m7 = _mm_slli_epi16(psi_a_m5_m7, 2);
    psi_a_m7_p7 = _mm_mulhi_epi16(psi_a_m7_p7, ONE_OVER_SQRT_2);
    psi_a_m7_p7 = _mm_slli_epi16(psi_a_m7_p7, 2);
    psi_a_m7_p5 = _mm_mulhi_epi16(psi_a_m7_p5, ONE_OVER_SQRT_2);
    psi_a_m7_p5 = _mm_slli_epi16(psi_a_m7_p5, 2);
    psi_a_m7_p3 = _mm_mulhi_epi16(psi_a_m7_p3, ONE_OVER_SQRT_2);
    psi_a_m7_p3 = _mm_slli_epi16(psi_a_m7_p3, 2);
    psi_a_m7_p1 = _mm_mulhi_epi16(psi_a_m7_p1, ONE_OVER_SQRT_2);
    psi_a_m7_p1 = _mm_slli_epi16(psi_a_m7_p1, 2);
    psi_a_m7_m1 = _mm_mulhi_epi16(psi_a_m7_m1, ONE_OVER_SQRT_2);
    psi_a_m7_m1 = _mm_slli_epi16(psi_a_m7_m1, 2);
    psi_a_m7_m3 = _mm_mulhi_epi16(psi_a_m7_m3, ONE_OVER_SQRT_2);
    psi_a_m7_m3 = _mm_slli_epi16(psi_a_m7_m3, 2);
    psi_a_m7_m5 = _mm_mulhi_epi16(psi_a_m7_m5, ONE_OVER_SQRT_2);
    psi_a_m7_m5 = _mm_slli_epi16(psi_a_m7_m5, 2);
    psi_a_m7_m7 = _mm_mulhi_epi16(psi_a_m7_m7, ONE_OVER_SQRT_2);
    psi_a_m7_m7 = _mm_slli_epi16(psi_a_m7_m7, 2);

    // Calculation of a group of two terms in the bit metric involving squares of interference
    square_a_64qam_epi16(a_r_p7_p7, a_i_p7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p7);
    square_a_64qam_epi16(a_r_p7_p5, a_i_p7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p5);
    square_a_64qam_epi16(a_r_p7_p3, a_i_p7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p3);
    square_a_64qam_epi16(a_r_p7_p1, a_i_p7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_p1);
    square_a_64qam_epi16(a_r_p7_m1, a_i_p7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m1);
    square_a_64qam_epi16(a_r_p7_m3, a_i_p7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m3);
    square_a_64qam_epi16(a_r_p7_m5, a_i_p7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m5);
    square_a_64qam_epi16(a_r_p7_m7, a_i_p7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p7_m7);
    square_a_64qam_epi16(a_r_p5_p7, a_i_p5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p7);
    square_a_64qam_epi16(a_r_p5_p5, a_i_p5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p5);
    square_a_64qam_epi16(a_r_p5_p3, a_i_p5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p3);
    square_a_64qam_epi16(a_r_p5_p1, a_i_p5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_p1);
    square_a_64qam_epi16(a_r_p5_m1, a_i_p5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m1);
    square_a_64qam_epi16(a_r_p5_m3, a_i_p5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m3);
    square_a_64qam_epi16(a_r_p5_m5, a_i_p5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m5);
    square_a_64qam_epi16(a_r_p5_m7, a_i_p5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p5_m7);
    square_a_64qam_epi16(a_r_p3_p7, a_i_p3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p7);
    square_a_64qam_epi16(a_r_p3_p5, a_i_p3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p5);
    square_a_64qam_epi16(a_r_p3_p3, a_i_p3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p3);
    square_a_64qam_epi16(a_r_p3_p1, a_i_p3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_p1);
    square_a_64qam_epi16(a_r_p3_m1, a_i_p3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m1);
    square_a_64qam_epi16(a_r_p3_m3, a_i_p3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m3);
    square_a_64qam_epi16(a_r_p3_m5, a_i_p3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m5);
    square_a_64qam_epi16(a_r_p3_m7, a_i_p3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p3_m7);
    square_a_64qam_epi16(a_r_p1_p7, a_i_p1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p7);
    square_a_64qam_epi16(a_r_p1_p5, a_i_p1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p5);
    square_a_64qam_epi16(a_r_p1_p3, a_i_p1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p3);
    square_a_64qam_epi16(a_r_p1_p1, a_i_p1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_p1);
    square_a_64qam_epi16(a_r_p1_m1, a_i_p1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m1);
    square_a_64qam_epi16(a_r_p1_m3, a_i_p1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m3);
    square_a_64qam_epi16(a_r_p1_m5, a_i_p1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m5);
    square_a_64qam_epi16(a_r_p1_m7, a_i_p1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_p1_m7);
    square_a_64qam_epi16(a_r_m1_p7, a_i_m1_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p7);
    square_a_64qam_epi16(a_r_m1_p5, a_i_m1_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p5);
    square_a_64qam_epi16(a_r_m1_p3, a_i_m1_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p3);
    square_a_64qam_epi16(a_r_m1_p1, a_i_m1_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_p1);
    square_a_64qam_epi16(a_r_m1_m1, a_i_m1_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m1);
    square_a_64qam_epi16(a_r_m1_m3, a_i_m1_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m3);
    square_a_64qam_epi16(a_r_m1_m5, a_i_m1_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m5);
    square_a_64qam_epi16(a_r_m1_m7, a_i_m1_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m1_m7);
    square_a_64qam_epi16(a_r_m3_p7, a_i_m3_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p7);
    square_a_64qam_epi16(a_r_m3_p5, a_i_m3_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p5);
    square_a_64qam_epi16(a_r_m3_p3, a_i_m3_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p3);
    square_a_64qam_epi16(a_r_m3_p1, a_i_m3_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_p1);
    square_a_64qam_epi16(a_r_m3_m1, a_i_m3_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m1);
    square_a_64qam_epi16(a_r_m3_m3, a_i_m3_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m3);
    square_a_64qam_epi16(a_r_m3_m5, a_i_m3_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m5);
    square_a_64qam_epi16(a_r_m3_m7, a_i_m3_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m3_m7);
    square_a_64qam_epi16(a_r_m5_p7, a_i_m5_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p7);
    square_a_64qam_epi16(a_r_m5_p5, a_i_m5_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p5);
    square_a_64qam_epi16(a_r_m5_p3, a_i_m5_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p3);
    square_a_64qam_epi16(a_r_m5_p1, a_i_m5_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_p1);
    square_a_64qam_epi16(a_r_m5_m1, a_i_m5_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m1);
    square_a_64qam_epi16(a_r_m5_m3, a_i_m5_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m3);
    square_a_64qam_epi16(a_r_m5_m5, a_i_m5_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m5);
    square_a_64qam_epi16(a_r_m5_m7, a_i_m5_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m5_m7);
    square_a_64qam_epi16(a_r_m7_p7, a_i_m7_p7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p7);
    square_a_64qam_epi16(a_r_m7_p5, a_i_m7_p5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p5);
    square_a_64qam_epi16(a_r_m7_p3, a_i_m7_p3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p3);
    square_a_64qam_epi16(a_r_m7_p1, a_i_m7_p1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_p1);
    square_a_64qam_epi16(a_r_m7_m1, a_i_m7_m1, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m1);
    square_a_64qam_epi16(a_r_m7_m3, a_i_m7_m3, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m3);
    square_a_64qam_epi16(a_r_m7_m5, a_i_m7_m5, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m5);
    square_a_64qam_epi16(a_r_m7_m7, a_i_m7_m7, ch_mag_int, SQRT_42_OVER_FOUR, a_sq_m7_m7);

    // Computing different multiples of ||h0||^2
    // x=1, y=1
    ch_mag_2_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,ONE_OVER_FOUR_SQRT_42);
    ch_mag_2_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_2_over_42_with_sigma2,1);
    // x=1, y=3
    ch_mag_10_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FIVE_OVER_FOUR_SQRT_42);
    ch_mag_10_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_10_over_42_with_sigma2,1);
    // x=1, y=5
    ch_mag_26_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTEEN_OVER_FOUR_SQRT_42);
    ch_mag_26_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_26_over_42_with_sigma2,1);
    // x=1, y=7
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=3, y=3
    ch_mag_18_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,NINE_OVER_FOUR_SQRT_42);
    ch_mag_18_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_18_over_42_with_sigma2,1);
    // x=3, y=5
    ch_mag_34_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,SEVENTEEN_OVER_FOUR_SQRT_42);
    ch_mag_34_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_34_over_42_with_sigma2,1);
    // x=3, y=7
    ch_mag_58_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_58_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_58_over_42_with_sigma2,2);
    // x=5, y=5
    ch_mag_50_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,TWENTYFIVE_OVER_FOUR_SQRT_42);
    ch_mag_50_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_50_over_42_with_sigma2,1);
    // x=5, y=7
    ch_mag_74_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,THIRTYSEVEN_OVER_FOUR_SQRT_42);
    ch_mag_74_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_74_over_42_with_sigma2,2);
    // x=7, y=7
    ch_mag_98_over_42_with_sigma2 = _mm_mulhi_epi16(ch_mag_des,FORTYNINE_OVER_FOUR_SQRT_42);
    ch_mag_98_over_42_with_sigma2 = _mm_slli_epi16(ch_mag_98_over_42_with_sigma2,2);

    // Computing Metrics
    xmm0 = _mm_subs_epi16(psi_a_p7_p7, a_sq_p7_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_7);
    bit_met_p7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p5, a_sq_p7_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_5);
    bit_met_p7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p3, a_sq_p7_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_3);
    bit_met_p7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_p1, a_sq_p7_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_7_1);
    bit_met_p7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m1, a_sq_p7_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_1);
    bit_met_p7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m3, a_sq_p7_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_3);
    bit_met_p7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m5, a_sq_p7_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_5);
    bit_met_p7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p7_m7, a_sq_p7_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_7_7);
    bit_met_p7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p7, a_sq_p5_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_7);
    bit_met_p5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p5, a_sq_p5_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_5);
    bit_met_p5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p3, a_sq_p5_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_3);
    bit_met_p5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_p1, a_sq_p5_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_5_1);
    bit_met_p5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m1, a_sq_p5_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_1);
    bit_met_p5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m3, a_sq_p5_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_3);
    bit_met_p5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m5, a_sq_p5_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_5);
    bit_met_p5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p5_m7, a_sq_p5_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_5_7);
    bit_met_p5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p7, a_sq_p3_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_7);
    bit_met_p3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p5, a_sq_p3_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_5);
    bit_met_p3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p3, a_sq_p3_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_3);
    bit_met_p3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_p1, a_sq_p3_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_3_1);
    bit_met_p3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m1, a_sq_p3_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_1);
    bit_met_p3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m3, a_sq_p3_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_3);
    bit_met_p3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m5, a_sq_p3_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_5);
    bit_met_p3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p3_m7, a_sq_p3_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_3_7);
    bit_met_p3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p7, a_sq_p1_p7);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_7);
    bit_met_p1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p5, a_sq_p1_p5);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_5);
    bit_met_p1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p3, a_sq_p1_p3);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_3);
    bit_met_p1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_p1, a_sq_p1_p1);
    xmm1 = _mm_adds_epi16(xmm0, y0_p_1_1);
    bit_met_p1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m1, a_sq_p1_m1);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_1);
    bit_met_p1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m3, a_sq_p1_m3);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_3);
    bit_met_p1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m5, a_sq_p1_m5);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_5);
    bit_met_p1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_p1_m7, a_sq_p1_m7);
    xmm1 = _mm_adds_epi16(xmm0, y0_m_1_7);
    bit_met_p1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);

    xmm0 = _mm_subs_epi16(psi_a_m1_p7, a_sq_m1_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_7);
    bit_met_m1_p7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p5, a_sq_m1_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_5);
    bit_met_m1_p5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p3, a_sq_m1_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_3);
    bit_met_m1_p3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_p1, a_sq_m1_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_1_1);
    bit_met_m1_p1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m1, a_sq_m1_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_1);
    bit_met_m1_m1 = _mm_subs_epi16(xmm1, ch_mag_2_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m3, a_sq_m1_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_3);
    bit_met_m1_m3 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m5, a_sq_m1_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_5);
    bit_met_m1_m5 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m1_m7, a_sq_m1_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_1_7);
    bit_met_m1_m7 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p7, a_sq_m3_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_7);
    bit_met_m3_p7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p5, a_sq_m3_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_5);
    bit_met_m3_p5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p3, a_sq_m3_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_3);
    bit_met_m3_p3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_p1, a_sq_m3_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_3_1);
    bit_met_m3_p1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m1, a_sq_m3_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_1);
    bit_met_m3_m1 = _mm_subs_epi16(xmm1, ch_mag_10_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m3, a_sq_m3_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_3);
    bit_met_m3_m3 = _mm_subs_epi16(xmm1, ch_mag_18_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m5, a_sq_m3_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_5);
    bit_met_m3_m5 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m3_m7, a_sq_m3_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_3_7);
    bit_met_m3_m7 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p7, a_sq_m5_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_7);
    bit_met_m5_p7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p5, a_sq_m5_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_5);
    bit_met_m5_p5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p3, a_sq_m5_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_3);
    bit_met_m5_p3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_p1, a_sq_m5_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_5_1);
    bit_met_m5_p1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m1, a_sq_m5_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_1);
    bit_met_m5_m1 = _mm_subs_epi16(xmm1, ch_mag_26_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m3, a_sq_m5_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_3);
    bit_met_m5_m3 = _mm_subs_epi16(xmm1, ch_mag_34_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m5, a_sq_m5_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_5);
    bit_met_m5_m5 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m5_m7, a_sq_m5_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_5_7);
    bit_met_m5_m7 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p7, a_sq_m7_p7);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_7);
    bit_met_m7_p7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p5, a_sq_m7_p5);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_5);
    bit_met_m7_p5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p3, a_sq_m7_p3);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_3);
    bit_met_m7_p3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_p1, a_sq_m7_p1);
    xmm1 = _mm_subs_epi16(xmm0, y0_m_7_1);
    bit_met_m7_p1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m1, a_sq_m7_m1);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_1);
    bit_met_m7_m1 = _mm_subs_epi16(xmm1, ch_mag_50_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m3, a_sq_m7_m3);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_3);
    bit_met_m7_m3 = _mm_subs_epi16(xmm1, ch_mag_58_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m5, a_sq_m7_m5);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_5);
    bit_met_m7_m5 = _mm_subs_epi16(xmm1, ch_mag_74_over_42_with_sigma2);
    xmm0 = _mm_subs_epi16(psi_a_m7_m7, a_sq_m7_m7);
    xmm1 = _mm_subs_epi16(xmm0, y0_p_7_7);
    bit_met_m7_m7 = _mm_subs_epi16(xmm1, ch_mag_98_over_42_with_sigma2);

    // Detection for 1st bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_m7_p7, bit_met_m7_p5);
    xmm1 = _mm_max_epi16(bit_met_m7_p3, bit_met_m7_p1);
    xmm2 = _mm_max_epi16(bit_met_m7_m1, bit_met_m7_m3);
    xmm3 = _mm_max_epi16(bit_met_m7_m5, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_p7, bit_met_m5_p5);
    xmm1 = _mm_max_epi16(bit_met_m5_p3, bit_met_m5_p1);
    xmm2 = _mm_max_epi16(bit_met_m5_m1, bit_met_m5_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_p7, bit_met_m3_p5);
    xmm1 = _mm_max_epi16(bit_met_m3_p3, bit_met_m3_p1);
    xmm2 = _mm_max_epi16(bit_met_m3_m1, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m3_m5, bit_met_m3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_p7, bit_met_m1_p5);
    xmm1 = _mm_max_epi16(bit_met_m1_p3, bit_met_m1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m1_m3);
    xmm3 = _mm_max_epi16(bit_met_m1_m5, bit_met_m1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p7_p5);
    xmm1 = _mm_max_epi16(bit_met_p7_p3, bit_met_p7_p1);
    xmm2 = _mm_max_epi16(bit_met_p7_m1, bit_met_p7_m3);
    xmm3 = _mm_max_epi16(bit_met_p7_m5, bit_met_p7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_p7, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p5_p3, bit_met_p5_p1);
    xmm2 = _mm_max_epi16(bit_met_p5_m1, bit_met_p5_m3);
    xmm3 = _mm_max_epi16(bit_met_p5_m5, bit_met_p5_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_p7, bit_met_p3_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p3_p1);
    xmm2 = _mm_max_epi16(bit_met_p3_m1, bit_met_p3_m3);
    xmm3 = _mm_max_epi16(bit_met_p3_m5, bit_met_p3_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_p7, bit_met_p1_p5);
    xmm1 = _mm_max_epi16(bit_met_p1_p3, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_p1_m1, bit_met_p1_m3);
    xmm3 = _mm_max_epi16(bit_met_p1_m5, bit_met_p1_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 2nd bit (LTE mapping)
    // bit = 1
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    // bit = 0
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 3rd bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2r = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 4th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m7_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y0i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // Detection for 5th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_m7_m7, bit_met_m7_m5);
    xmm1 = _mm_max_epi16(bit_met_m7_m3, bit_met_m7_m1);
    xmm2 = _mm_max_epi16(bit_met_m7_p1, bit_met_m7_p3);
    xmm3 = _mm_max_epi16(bit_met_m7_p5, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m1_m7, bit_met_m1_m5);
    xmm1 = _mm_max_epi16(bit_met_m1_m3, bit_met_m1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m1_p3);
    xmm3 = _mm_max_epi16(bit_met_m1_p5, bit_met_m1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p1_m7, bit_met_p1_m5);
    xmm1 = _mm_max_epi16(bit_met_p1_m3, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_p1_p1, bit_met_p1_p3);
    xmm3 = _mm_max_epi16(bit_met_p1_p5, bit_met_p1_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p7_m5);
    xmm1 = _mm_max_epi16(bit_met_p7_m3, bit_met_p7_m1);
    xmm2 = _mm_max_epi16(bit_met_p7_p1, bit_met_p7_p3);
    xmm3 = _mm_max_epi16(bit_met_p7_p5, bit_met_p7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_m5_m7, bit_met_m5_m5);
    xmm1 = _mm_max_epi16(bit_met_m5_m3, bit_met_m5_m1);
    xmm2 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_m3_m7, bit_met_m3_m5);
    xmm1 = _mm_max_epi16(bit_met_m3_m3, bit_met_m3_m1);
    xmm2 = _mm_max_epi16(bit_met_m3_p1, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m3_p5, bit_met_m3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p3_m7, bit_met_p3_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p3_m1);
    xmm2 = _mm_max_epi16(bit_met_p3_p1, bit_met_p3_p3);
    xmm3 = _mm_max_epi16(bit_met_p3_p5, bit_met_p3_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p5_m7, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p5_m3, bit_met_p5_m1);
    xmm2 = _mm_max_epi16(bit_met_p5_p1, bit_met_p5_p3);
    xmm3 = _mm_max_epi16(bit_met_p5_p5, bit_met_p5_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y1i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);

    // Detection for 6th bit (LTE mapping)
    xmm0 = _mm_max_epi16(bit_met_p7_p7, bit_met_p5_p7);
    xmm1 = _mm_max_epi16(bit_met_p3_p7, bit_met_p1_p7);
    xmm2 = _mm_max_epi16(bit_met_m1_p7, bit_met_m3_p7);
    xmm3 = _mm_max_epi16(bit_met_m5_p7, bit_met_m7_p7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p1, bit_met_p5_p1);
    xmm1 = _mm_max_epi16(bit_met_p3_p1, bit_met_p1_p1);
    xmm2 = _mm_max_epi16(bit_met_m1_p1, bit_met_m3_p1);
    xmm3 = _mm_max_epi16(bit_met_m5_p1, bit_met_m5_p1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m1, bit_met_p5_m1);
    xmm1 = _mm_max_epi16(bit_met_p3_m1, bit_met_p1_m1);
    xmm2 = _mm_max_epi16(bit_met_m1_m1, bit_met_m3_m1);
    xmm3 = _mm_max_epi16(bit_met_m5_m1, bit_met_m7_m1);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m7, bit_met_p5_m7);
    xmm1 = _mm_max_epi16(bit_met_p3_m7, bit_met_p1_m7);
    xmm2 = _mm_max_epi16(bit_met_m1_m7, bit_met_m3_m7);
    xmm3 = _mm_max_epi16(bit_met_m5_m7, bit_met_m7_m7);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm4);
    logmax_den_re0 = _mm_max_epi16(logmax_den_re0, xmm5);

    xmm0 = _mm_max_epi16(bit_met_p7_m5, bit_met_p5_m5);
    xmm1 = _mm_max_epi16(bit_met_p3_m5, bit_met_p1_m5);
    xmm2 = _mm_max_epi16(bit_met_m1_m5, bit_met_m3_m5);
    xmm3 = _mm_max_epi16(bit_met_m5_m5, bit_met_m7_m5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(xmm4, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_m3, bit_met_p5_m3);
    xmm1 = _mm_max_epi16(bit_met_p3_m3, bit_met_p1_m3);
    xmm2 = _mm_max_epi16(bit_met_m1_m3, bit_met_m3_m3);
    xmm3 = _mm_max_epi16(bit_met_m5_m3, bit_met_m7_m3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p3, bit_met_p5_p3);
    xmm1 = _mm_max_epi16(bit_met_p3_p3, bit_met_p1_p3);
    xmm2 = _mm_max_epi16(bit_met_m1_p3, bit_met_m3_p3);
    xmm3 = _mm_max_epi16(bit_met_m5_p3, bit_met_m7_p3);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);
    xmm0 = _mm_max_epi16(bit_met_p7_p5, bit_met_p5_p5);
    xmm1 = _mm_max_epi16(bit_met_p3_p5, bit_met_p1_p5);
    xmm2 = _mm_max_epi16(bit_met_m1_p5, bit_met_m3_p5);
    xmm3 = _mm_max_epi16(bit_met_m5_p5, bit_met_m7_p5);
    xmm4 = _mm_max_epi16(xmm0, xmm1);
    xmm5 = _mm_max_epi16(xmm2, xmm3);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm4);
    logmax_num_re0 = _mm_max_epi16(logmax_num_re0, xmm5);

    y2i = _mm_subs_epi16(logmax_num_re0, logmax_den_re0);


    // map to output stream, difficult to do in SIMD since we have 6 16bit LLRs
    // RE 1
    j = 24*i;
    stream0_out[j + 0] = ((short *)&y0r)[0];
    stream0_out[j + 1] = ((short *)&y1r)[0];
    stream0_out[j + 2] = ((short *)&y2r)[0];
    stream0_out[j + 3] = ((short *)&y0i)[0];
    stream0_out[j + 4] = ((short *)&y1i)[0];
    stream0_out[j + 5] = ((short *)&y2i)[0];
    // RE 2
    stream0_out[j + 6] = ((short *)&y0r)[1];
    stream0_out[j + 7] = ((short *)&y1r)[1];
    stream0_out[j + 8] = ((short *)&y2r)[1];
    stream0_out[j + 9] = ((short *)&y0i)[1];
    stream0_out[j + 10] = ((short *)&y1i)[1];
    stream0_out[j + 11] = ((short *)&y2i)[1];
    // RE 3
    stream0_out[j + 12] = ((short *)&y0r)[2];
    stream0_out[j + 13] = ((short *)&y1r)[2];
    stream0_out[j + 14] = ((short *)&y2r)[2];
    stream0_out[j + 15] = ((short *)&y0i)[2];
    stream0_out[j + 16] = ((short *)&y1i)[2];
    stream0_out[j + 17] = ((short *)&y2i)[2];
    // RE 4
    stream0_out[j + 18] = ((short *)&y0r)[3];
    stream0_out[j + 19] = ((short *)&y1r)[3];
    stream0_out[j + 20] = ((short *)&y2r)[3];
    stream0_out[j + 21] = ((short *)&y0i)[3];
    stream0_out[j + 22] = ((short *)&y1i)[3];
    stream0_out[j + 23] = ((short *)&y2i)[3];
    // RE 5
    stream0_out[j + 24] = ((short *)&y0r)[4];
    stream0_out[j + 25] = ((short *)&y1r)[4];
    stream0_out[j + 26] = ((short *)&y2r)[4];
    stream0_out[j + 27] = ((short *)&y0i)[4];
    stream0_out[j + 28] = ((short *)&y1i)[4];
    stream0_out[j + 29] = ((short *)&y2i)[4];
    // RE 6
    stream0_out[j + 30] = ((short *)&y0r)[5];
    stream0_out[j + 31] = ((short *)&y1r)[5];
    stream0_out[j + 32] = ((short *)&y2r)[5];
    stream0_out[j + 33] = ((short *)&y0i)[5];
    stream0_out[j + 34] = ((short *)&y1i)[5];
    stream0_out[j + 35] = ((short *)&y2i)[5];
    // RE 7
    stream0_out[j + 36] = ((short *)&y0r)[6];
    stream0_out[j + 37] = ((short *)&y1r)[6];
    stream0_out[j + 38] = ((short *)&y2r)[6];
    stream0_out[j + 39] = ((short *)&y0i)[6];
    stream0_out[j + 40] = ((short *)&y1i)[6];
    stream0_out[j + 41] = ((short *)&y2i)[6];
    // RE 8
    stream0_out[j + 42] = ((short *)&y0r)[7];
    stream0_out[j + 43] = ((short *)&y1r)[7];
    stream0_out[j + 44] = ((short *)&y2r)[7];
    stream0_out[j + 45] = ((short *)&y0i)[7];
    stream0_out[j + 46] = ((short *)&y1i)[7];
    stream0_out[j + 47] = ((short *)&y2i)[7];

#elif defined(__arm__)

#endif

  }

#if defined(__x86_64__) || defined(__i386__)
  _mm_empty();
  _m_empty();
#endif
}
#endif


/*! \brief Per-OFDM-symbol wrapper around the 64QAM/64QAM LLR kernel.
 *
 * Selects the slice of each input array that starts at element
 * symbol*nb_rb*12 of index [0] and passes it, together with the output
 * buffer, to qam64_qam64_avx2() (builds with __AVX2__) or to qam64_qam64()
 * (all other builds).
 *
 * \param frame_parms          not used by this function
 * \param rxdataF_comp         compensated received samples; only rxdataF_comp[0] is read
 * \param rxdataF_comp_i       second compensated sample set (presumably the interfering
 *                             stream, going by the _i suffix - TODO confirm); only [0] is read
 * \param dl_ch_mag            channel magnitudes matching rxdataF_comp; only [0] is read
 * \param dl_ch_mag_i          channel magnitudes matching rxdataF_comp_i; only [0] is read
 * \param rho_i                correlation term handed to the kernel as "rho"; only [0] is read
 * \param dlsch_llr            output LLR buffer, must not be NULL
 * \param symbol               OFDM symbol index used to compute the input offset
 * \param len                  number of 32-bit input elements to process for this symbol
 * \param first_symbol_flag    not used by this function
 * \param nb_rb                number of resource blocks used to compute the input offset
 * \param pbch_pss_sss_adjust  not used by this function
 * \param llr_offset           not used by this function (see NOTE below)
 * \return always 0
 *
 * NOTE(review): the kernel writes at dlsch_llr itself; llr_offset is not
 * applied here. Presumably the caller passes an already-offset pointer -
 * confirm against the call site.
 */
int nr_dlsch_64qam_64qam_llr(NR_DL_FRAME_PARMS *frame_parms,
                          int32_t **rxdataF_comp,
                          int32_t **rxdataF_comp_i,
                          int32_t **dl_ch_mag,
                          int32_t **dl_ch_mag_i,
                          int32_t **rho_i,
                          int16_t *dlsch_llr,
                          uint8_t symbol,
                          uint32_t len,
                          uint8_t first_symbol_flag,
                          uint16_t nb_rb,
                          uint16_t pbch_pss_sss_adjust,
                          //int16_t **llr16p,
                          uint32_t llr_offset)
{

  // Start of this symbol's data inside each per-slot array.
  int16_t *rxF      = (int16_t*)&rxdataF_comp[0][(symbol*nb_rb*12)];
  int16_t *rxF_i    = (int16_t*)&rxdataF_comp_i[0][(symbol*nb_rb*12)];
  int16_t *ch_mag   = (int16_t*)&dl_ch_mag[0][(symbol*nb_rb*12)];
  int16_t *ch_mag_i = (int16_t*)&dl_ch_mag_i[0][(symbol*nb_rb*12)];
  int16_t *rho      = (int16_t*)&rho_i[0][(symbol*nb_rb*12)];

  AssertFatal(dlsch_llr!=NULL,"nr_dlsch_64qam_64qam_llr:llr is null, symbol %d\n",symbol);

#ifdef __AVX2__

  // The AVX2 kernel works on zero-padded private copies of the inputs.
  // len256i is the smallest multiple of 16 strictly greater than len, so the
  // copies always carry between 1 and 16 zeroed trailing elements.
  const uint32_t len256i   = ((len+16)>>4)*16;
  const size_t   buf_bytes = (size_t)len256i*sizeof(int32_t);
  const size_t   in_bytes  = (size_t)len*sizeof(int32_t);

  int32_t *rxF_256i      = malloc16_clear(buf_bytes);
  int32_t *rxF_i_256i    = malloc16_clear(buf_bytes);
  int32_t *ch_mag_256i   = malloc16_clear(buf_bytes);
  int32_t *ch_mag_i_256i = malloc16_clear(buf_bytes);
  int32_t *rho_256i      = malloc16_clear(buf_bytes);

  // Fail loudly instead of letting memcpy dereference a NULL buffer.
  AssertFatal(rxF_256i!=NULL && rxF_i_256i!=NULL && ch_mag_256i!=NULL &&
              ch_mag_i_256i!=NULL && rho_256i!=NULL,
              "nr_dlsch_64qam_64qam_llr:work buffer allocation failed, symbol %d\n",symbol);

  memcpy(rxF_256i, rxF, in_bytes);
  memcpy(rxF_i_256i, rxF_i, in_bytes);
  memcpy(ch_mag_256i, ch_mag, in_bytes);
  memcpy(ch_mag_i_256i, ch_mag_i, in_bytes);
  memcpy(rho_256i, rho, in_bytes);

  qam64_qam64_avx2(rxF_256i,
                   rxF_i_256i,
                   ch_mag_256i,
                   ch_mag_i_256i,
                   dlsch_llr,
                   rho_256i,
                   len);

  // Report the real allocation size (not the size of the pointer variable).
  free16(rxF_256i, buf_bytes);
  free16(rxF_i_256i, buf_bytes);
  free16(ch_mag_256i, buf_bytes);
  free16(ch_mag_i_256i, buf_bytes);
  free16(rho_256i, buf_bytes);

#else
  qam64_qam64((short *)rxF,
              (short *)rxF_i,
              (short *)ch_mag,
              (short *)ch_mag_i,
              (short *)dlsch_llr,
              (short *)rho,
              len);
#endif

  return(0);
}