Commit fe1dc24e authored by Bruno Mongazon-Cazavet's avatar Bruno Mongazon-Cazavet Committed by Robert Schmidt

Add Intrinsics usage by SIMDE

parent a1de5e3d
......@@ -178,7 +178,6 @@ if (CUDA_FOUND)
# Disable warnings for CUDA
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lpthread;-w;-O3;--default-stream;per-thread;-I/usr/local/cuda/inc;-L/usr/local/cuda/lib -lcutil;-rdc=true;-lcudadevrt")
SET(CUDA_VERBOSE_BUILD ON)
SET(CUDA_HOST_COMPILER "/usr/bin/g++")
SET(CUDA_SEPARABLE_COMPILATION ON)
......@@ -198,44 +197,36 @@ message("CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel." Debug Release RelWithDebInfo MinSizeRel)
Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}")
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "-gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt")
else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
if(EXISTS "/proc/cpuinfo")
file(STRINGS "/proc/cpuinfo" CPUINFO REGEX flags LIMIT_COUNT 1)
message("NOAVX512 is ${NOAVX512}")
if (CPUINFO MATCHES "avx512bw" AND "${NOAVX512}" STREQUAL "False")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512 " )
set(COMPILATION_AVX2 "True")
else()
if (CPUINFO MATCHES "avx2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
set(COMPILATION_AVX2 "True")
else()
set(COMPILATION_AVX2 "False")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1 -mpclmul")
endif()
if (CPUINFO MATCHES "ssse3")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mssse3")
endif()
endif()
else()
Message("/proc/cpuinfo does not exit. We will use manual CPU flags")
endif()
# in case /proc/cpuinfo exists we want to inspect available Intrinsics
# -so not to go always through SIMDE emulation
# -so to avoid AVX512 instructions generation by gcc
if(EXISTS "/proc/cpuinfo")
file(STRINGS "/proc/cpuinfo" CPUINFO REGEX flags LIMIT_COUNT 1)
message("AVX512 is ${AVX512}")
message("AVX2 is ${AVX2}")
if ("${AVX512}" STREQUAL "False")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mno-avx512f")
else()
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512")
endif()
if (CPUINFO MATCHES "avx2" AND "${AVX2}" STREQUAL "True")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_AVX2_NATIVE")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSE4_1_NATIVE")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSE4_2_NATIVE")
endif()
if (CPUINFO MATCHES "ssse3")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSSE3_NATIVE")
endif()
endif()
set(C_FLAGS_PROCESSOR " ${C_FLAGS_PROCESSOR} ${CFLAGS_PROCESSOR_USER}")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -fno-var-tracking-assignments -march=native")
Message("C_FLAGS_PROCESSOR is ${C_FLAGS_PROCESSOR}")
#if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86")
# if ( (NOT( C_FLAGS_PROCESSOR MATCHES "ssse3")) OR (NOT( C_FLAGS_PROCESSOR MATCHES "msse4.1")) )
# Message(FATAL_ERROR "For x86 Architecture, you must have following flags: -mssse3 -msse4.1. The current detected flags are: ${C_FLAGS_PROCESSOR}. You can pass the flags manually in build script, for example: ./build_oai --cflags_processor \"-mssse3 -msse4.1 -mavx2\" ")
# endif()
#endif()
#
# add autotools definitions that were maybe used!
......@@ -855,9 +846,6 @@ endif ()
include_directories ("${OPENAIR_DIR}/sdr/COMMON")
Message("DEADLINE_SCHEDULER flag is ${DEADLINE_SCHEDULER}")
Message("CPU_Affinity flag is ${CPU_AFFINITY}")
##############################################################
# ???!!! TO BE DOCUMENTED OPTIONS !!!???
##############################################################
......@@ -1711,14 +1699,8 @@ if (${SMBV})
set(PHY_SRC "${PHY_SRC} ${OPENAIR1_DIR}/PHY/TOOLS/smbv.c")
endif (${SMBV})
if (${COMPILATION_AVX2} STREQUAL "True")
#set(PHY_SRC ${PHY_SRC} ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c)
set(PHY_SRC_UE ${PHY_SRC_UE} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
endif ()
if (${COMPILATION_AVX2} STREQUAL "True")
set(PHY_NR_UE_SRC ${PHY_NR_UE_SRC} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
endif ()
set(PHY_SRC_UE ${PHY_SRC_UE} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
set(PHY_NR_UE_SRC ${PHY_NR_UE_SRC} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
add_library(PHY_COMMON ${PHY_SRC_COMMON})
add_dependencies(PHY_COMMON rrc_flag)
......@@ -2829,16 +2811,14 @@ add_executable(nr-uesoftmodem
target_link_libraries (nr-uesoftmodem
-Wl,--start-group
RRC_LIB NR_RRC_LIB NGAP_LIB NGAP_GNB SECU_CN SECU_OSA UTIL HASHTABLE SCTP_CLIENT SCHED_RU_LIB SCHED_UE_LIB SCHED_NR_UE_LIB
PHY_COMMON PHY_NR_COMMON PHY_UE PHY_NR_UE PHY_RU NR_L2_UE L2_UE_LTE_NR MAC_NR_COMMON NFAPI_COMMON_LIB NFAPI_LIB NFAPI_PNF_LIB
NFAPI_USER_LIB MISC_NFAPI_NR_LIB S1AP_LIB S1AP_ENB
${RAL_LIB} ${NAS_UE_LIB} ITTI ${FLPT_MSG_LIB} ${ATLAS_LIBRARIES}
NFAPI_USER_LIB S1AP_LIB S1AP_ENB
${RAL_LIB} ${NAS_UE_LIB} ITTI ${FLPT_MSG_LIB} ${ATLAS_LIBRARIES} LIB_5GNAS_GNB LIB_NAS_SIMUE ${NAS_SIM_LIB}
NR_RRC_LIB SECU_CN SECU_OSA UTIL HASHTABLE SCHED_RU_LIB SCHED_NR_UE_LIB
PHY_COMMON PHY_NR_COMMON PHY_NR_UE NR_L2_UE L2_UE_LTE_NR MAC_NR_COMMON NFAPI_COMMON_LIB NFAPI_LIB NFAPI_PNF_LIB
NFAPI_USER_LIB MISC_NFAPI_NR_LIB
${RAL_LIB} ITTI ${ATLAS_LIBRARIES} LIB_5GNAS_GNB LIB_NAS_SIMUE ${NAS_SIM_LIB}
-Wl,--end-group z dl)
target_link_libraries (nr-uesoftmodem ${LIBXML2_LIBRARIES})
target_link_libraries (nr-uesoftmodem pthread m CONFIG_LIB rt crypt ${CRYPTO_LIBRARIES} ${OPENSSL_LIBRARIES} sctp ${XFORMS_LIBRARIES} ${CMAKE_DL_LIBS} ${LIBYAML_LIBRARIES} ${ATLAS_LIBRARIES})
target_link_libraries (nr-uesoftmodem pthread m CONFIG_LIB rt crypt ${CRYPTO_LIBRARIES} ${OPENSSL_LIBRARIES} ${XFORMS_LIBRARIES} ${CMAKE_DL_LIBS} ${LIBYAML_LIBRARIES} ${ATLAS_LIBRARIES})
target_link_libraries (nr-uesoftmodem ${LIB_LMS_LIBRARIES})
target_link_libraries (nr-uesoftmodem ${T_LIB})
......@@ -3126,7 +3106,7 @@ function(make_driver name dir)
endforeach()
CONFIGURE_FILE(${OPENAIR_CMAKE}/tools/Kbuild.cmake ${OPENAIR_BIN_DIR}/${name}/Kbuild)
add_custom_command(OUTPUT ${name}.ko
COMMAND make -j2 -C ${module_build_path} M=${OPENAIR_BIN_DIR}/${name}
COMMAND make -C ${module_build_path} M=${OPENAIR_BIN_DIR}/${name}
WORKING_DIRECTORY ${OPENAIR_BIN_DIR}/${name}
COMMENT "building ${module}.ko"
VERBATIM
......
......@@ -177,7 +177,7 @@ function variant__v1__enb_usrp {
function variant__v2__basic_sim {
NB_PATTERN_FILES=11
BUILD_OPTIONS="--eNB --UE"
BUILD_OPTIONS="--eNB --UE --noavx512"
VM_MEMORY=8192
RUN_OPTIONS="complex"
}
......@@ -201,28 +201,28 @@ function variant__v5__gnb_usrp {
VM_MEMORY=10240
VM_CPU=8
NB_PATTERN_FILES=6
BUILD_OPTIONS="--gNB -w USRP"
BUILD_OPTIONS="--gNB -w USRP --noavx512"
}
function variant__v6__nr_ue_usrp {
VM_MEMORY=4096
VM_CPU=4
NB_PATTERN_FILES=6
BUILD_OPTIONS="--nrUE -w USRP"
BUILD_OPTIONS="--nrUE -w USRP --noavx512"
}
function variant__v7__enb_ethernet {
VM_MEMORY=4096
ARCHIVES_LOC=enb_eth
NB_PATTERN_FILES=6
BUILD_OPTIONS="--eNB -w USRP"
BUILD_OPTIONS="--eNB -w USRP --noavx512"
}
function variant__v8__ue_ethernet {
VM_MEMORY=4096
ARCHIVES_LOC=ue_eth
NB_PATTERN_FILES=10
BUILD_OPTIONS="--UE -w USRP"
BUILD_OPTIONS="--UE -w USRP --noavx512"
}
function variant__v10__flexran_rtc {
......
......@@ -34,7 +34,7 @@
<mode>TesteNB</mode>
<class>Build_eNB</class>
<desc>Build gNB (USRP)</desc>
<Build_eNB_args>--gNB -w USRP --ninja --cmake-opt -DBoost_INCLUDE_DIR=/usr/include/boost169 --noavx512</Build_eNB_args>
<Build_eNB_args>--gNB -w USRP --ninja --cmake-opt -DBoost_INCLUDE_DIR=/usr/include/boost169 --cmake-opt -DCUDA_HOST_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/gcc</Build_eNB_args>
<forced_workspace_cleanup>True</forced_workspace_cleanup>
</testCase>
......
......@@ -55,7 +55,8 @@ BUILD_COVERITY_SCAN=0
DISABLE_HARDWARE_DEPENDENCY="False"
CMAKE_BUILD_TYPE="RelWithDebInfo"
CMAKE_CMD="$CMAKE"
NOAVX512="False"
AVX512="True"
AVX2="True"
BUILD_ECLIPSE=0
NR="False"
OPTIONAL_LIBRARIES="telnetsrv enbscope uescope nrscope"
......@@ -127,11 +128,9 @@ Options:
-x | --xforms
Will compile with software oscilloscope features
--verbose-ci
Compile with verbose instructions in CI Docker env
Compile with verbose instructions in CI Docker env
--verbose-compile
Shows detailed compilation instructions in makefile
--cflags_processor
Manually Add CFLAGS of processor if they are not detected correctly by script. Only add these flags if you know your processor supports them. Example flags: -msse3 -msse4.1 -msse4.2 -mavx2
--build-doxygen
Builds doxygen based documentation.
--build-coverity-scan
......@@ -159,10 +158,10 @@ Options:
Build eclipse project files.
--build-lib <libraries>
Build optional shared library, <libraries> can be one or several of $OPTIONAL_LIBRARIES or \"all\"
--usrp-recplay
Build for I/Q record-playback modes
--noavx512
Build without AVX512 if it is present on CPU
Disable AVX512 intrinsics whatever processor capability is
--noavx2
Disable AVX2 intrinsics if processor supports it or use SIMDE emulation
-k | --skip-shared-libraries
Skip build for shared libraries to reduce compilation time when building frequently for debugging purposes
--ninja
......@@ -349,10 +348,6 @@ function main() {
VERBOSE_COMPILE=1
echo_info "Will compile with verbose instructions"
shift;;
--cflags_processor)
CMAKE_CMD="$CMAKE_CMD -DCFLAGS_PROCESSOR_USER=\"$2\""
echo_info "Setting CPU FLAGS from USER to: $2"
shift 2;;
--build-doxygen)
BUILD_DOXYGEN=1
echo_info "Will build doxygen support"
......@@ -430,8 +425,12 @@ function main() {
fi
shift 2;;
--noavx512)
NOAVX512="True"
echo_info "Disabling AVX512"
AVX512="False"
echo_info "Disabling AVX512 instructions"
shift 1;;
--noavx2)
AVX2="False"
echo_info "Disabling AVX2 instructions"
shift 1;;
-k | --skip-shared-libraries)
SKIP_SHARED_LIB_FLAG="True"
......@@ -616,7 +615,7 @@ function main() {
if [[ ${#CMAKE_CXX_FLAGS[@]} > 0 ]]; then CMAKE_CMD="$CMAKE_CMD -DCMAKE_CXX_FLAGS=\"${CMAKE_CXX_FLAGS[*]}\""; fi
# for historical reasons we build in a subdirectory cmake_targets/XYZ/build,
# e.g., cmake_targets/ran_build/build, hence the ../../..
CMAKE_CMD="$CMAKE_CMD -DNOAVX512=\"${NOAVX512[*]}\" ../../.."
CMAKE_CMD="$CMAKE_CMD -DAVX512=\"${AVX512[*]}\" -DAVX2=\"${AVX2[*]}\" ../../.."
echo_info "running $CMAKE_CMD"
eval $CMAKE_CMD
......
......@@ -122,6 +122,7 @@ check_supported_distribution() {
"rhel8.6") return 0 ;;
"rhel8.7") return 0 ;;
"centos7") return 0 ;;
"centos8") return 0 ;;
esac
return 1
}
......@@ -897,6 +898,7 @@ check_install_oai_software() {
fi
install_asn1c_from_source $1
install_simde_from_source $1
}
install_asn1c_from_source(){
......@@ -923,6 +925,15 @@ install_asn1c_from_source(){
) > $asn1_install_log 2>&1
}
install_simde_from_source(){
echo_info "\nInstalling SIMDE from source without test cases (header files only)"
$SUDO rm -rf /tmp/simde
git clone https://github.com/simd-everywhere/simde-no-tests.git /tmp/simde
cd /tmp/simde
# brute force copy into /usr/include
$SUDO \cp -rv ../simde /usr/include
}
#################################################
# 2. compile
################################################
......
......@@ -37,7 +37,7 @@ extern double cpu_freq_GHz __attribute__ ((aligned(32)));;
// structure to store data to compute cpu measurment
#if defined(__x86_64__) || defined(__i386__)
typedef long long oai_cputime_t;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
typedef uint32_t oai_cputime_t;
#else
#error "building on unsupported CPU architecture"
......@@ -107,7 +107,7 @@ static inline unsigned long long rdtsc_oai(void) {
return (d<<32) | a;
}
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
static inline uint32_t rdtsc_oai(void) __attribute__((always_inline));
static inline uint32_t rdtsc_oai(void) {
uint32_t r = 0;
......
......@@ -34,11 +34,7 @@ extern "C" {
#endif
#ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x+32)
# else
# define malloc16(x) memalign(16,x+16)
# endif
#endif
#define free16(y,x) free(y)
#define bigmalloc malloc
......@@ -54,11 +50,7 @@ extern "C" {
} while (0)
static inline void *malloc16_clear( size_t size ) {
#ifdef __AVX2__
void *ptr = memalign(32, size+32);
#else
void *ptr = memalign(16, size+16);
#endif
DevAssert(ptr);
memset( ptr, 0, size );
return ptr;
......@@ -91,11 +83,7 @@ static inline void *malloc_or_fail(size_t size) {
# define msg(aRGS...) LOG_D(PHY, ##aRGS)
#endif
#ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x)
# else
# define malloc16(x) memalign(16,x)
# endif
#endif
#define free16(y,x) free(y)
......
......@@ -41,7 +41,7 @@
#include <executables/split_headers.h>
#include <openair1/PHY/CODING/coding_extern.h>
#include <threadPool/thread-pool.h>
#include <emmintrin.h>
#include "PHY/sse_intrin.h"
#define FS6_BUF_SIZE 1000*1000
static UDPsock_t sockFS6;
......
......@@ -75,7 +75,8 @@ int oai_exit = 0;
double cpuf;
THREAD_STRUCT thread_struct;
uint16_t sf_ahead=4;
extern uint16_t sf_ahead; // Bell Labs
//uint16_t sf_ahead=4;
//uint16_t slot_ahead=6;
int otg_enabled;
uint64_t downlink_frequency[MAX_NUM_CCs][4];
......@@ -725,12 +726,8 @@ void ocp_tx_rf(RU_t *ru, L1_rxtx_proc_t *proc) {
}
#if defined(__x86_64) || defined(__i386__)
#ifdef __AVX2__
sf_extension = (sf_extension)&0xfffffff8;
#else
sf_extension = (sf_extension)&0xfffffffc;
#endif
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
sf_extension = (sf_extension)&0xfffffffc;
#endif
......@@ -1141,6 +1138,7 @@ int main ( int argc, char **argv ) {
int i;
int CC_id = 0;
int node_type = ngran_eNB;
sf_ahead=4; // Bell Labs
AssertFatal(load_configmodule(argc,argv,0), "[SOFTMODEM] Error, configuration module init failed\n");
logInit();
printf("Reading in command-line options\n");
......
......@@ -210,10 +210,11 @@ void set_softmodem_sighandler(void) {
act.sa_handler=signal_handler;
sigaction(SOFTMODEM_RTSIGNAL,&act,&oldact);
// Disabled in order generate a core dump for analysis with gdb
// Enable for clean exit on CTRL-C (i.e. record player, USRP...)
signal(SIGINT, signal_handler);
# if 0
printf("Send signal %d to display resource usage...\n",SIGRTMIN+1);
signal(SIGSEGV, signal_handler);
signal(SIGINT, signal_handler);
signal(SIGTERM, signal_handler);
signal(SIGABRT, signal_handler);
#endif
......
This diff is collapsed.
......@@ -153,7 +153,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
__m128i *y_parity128 = (__m128i *)y_parity;
__m128i *m10_128 = (__m128i *)m10;
__m128i *m11_128 = (__m128i *)m11;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t *systematic128 = (int8x16_t *)systematic;
int8x16_t *y_parity128 = (int8x16_t *)y_parity;
int8x16_t *m10_128 = (int8x16_t *)m10;
......@@ -177,7 +177,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
_mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
_mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]);
#endif
......@@ -193,7 +193,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
_mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
_mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]);
#endif
......@@ -209,7 +209,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i alpha_max;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr;
int8x16_t *m11p,*m10p;
int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
......@@ -299,7 +299,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh
alpha[112] = -MAX8/2;
}
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
alpha128[0] = vdupq_n_s8(-MAX8/2);
alpha128[0] = vsetq_lane_s8(0,alpha128[0],0);
alpha128[1] = vdupq_n_s8(-MAX8/2);
......@@ -401,7 +401,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i *beta128,*alpha128,*beta_ptr;
__m128i beta_max;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t m11_128,m10_128;
int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
......@@ -421,7 +421,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
#if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i *)&beta[frame_length<<3];
alpha128 = (__m128i *)&alpha[0];
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
alpha128 = (int8x16_t *)&alpha[0];
#endif
......@@ -451,7 +451,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15);
beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15);
beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15);
beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15);
beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15);
......@@ -465,7 +465,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
#if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i *)&beta[frame_length<<3];
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
#endif
......@@ -515,7 +515,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m11_128=((int8x16_t *)m_11)[k];
m10_128=((int8x16_t *)m_10)[k];
m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11
......@@ -575,7 +575,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_srli_si128(beta128[5],1);
beta_ptr[6] = _mm_srli_si128(beta128[6],1);
beta_ptr[7] = _mm_srli_si128(beta128[7],1);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
beta128 = (int8x16_t *)&beta[0];
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8);
......@@ -608,7 +608,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l
__m128i m01_1,m01_2,m01_3,m01_4;
__m128i m10_1,m10_2,m10_3,m10_4;
__m128i m11_1,m11_2,m11_3,m11_4;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t *alpha128=(int8x16_t *)alpha;
int8x16_t *beta128=(int8x16_t *)beta;
int8x16_t *m11_128,*m10_128,*ext_128;
......@@ -670,7 +670,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l
*ext_128 = _mm_subs_epi8(m10_1,m01_1);
alpha_ptr+=8;
beta_ptr+=8;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m11_128 = (int8x16_t *)&m_11[k<<4];
m10_128 = (int8x16_t *)&m_10[k<<4];
ext_128 = (int8x16_t *)&ext[k<<4];
......@@ -820,7 +820,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
__m128i *yp128;
__m128i tmp128[(n+8)>>3];
__m128i tmp={0}, zeros=_mm_setzero_si128();
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t *yp128;
int8x16_t tmp128[(n+8)>>3];
int8x16_t tmp, zeros=vdupq_n_s8(0);
......@@ -900,7 +900,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],4));
yp128 = (__m128i *)y8;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int32x4_t avg=vdupq_n_s32(0);
for (i=0; i<(3*(n>>4))+1; i++) {
......@@ -1019,7 +1019,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,0);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,1);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,2);
......@@ -1067,7 +1067,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
......@@ -1111,7 +1111,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
......@@ -1166,7 +1166,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
}
}
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
uint8x16_t *dbytes=(uint8x16_t *)decoded_bytes_interl;
uint16x8_t mask __attribute__((aligned(16)));
int n_128=n2>>7;
......@@ -1208,7 +1208,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8);
tmp=_mm_cmpgt_epi8(tmp,zeros);
((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7);
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6);
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5);
......@@ -1286,7 +1286,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
__m128i *ext_128=(__m128i *) ext;
__m128i *s1_128=(__m128i *) systematic1;
__m128i *s0_128=(__m128i *) systematic0;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
int8x16_t *ext_128=(int8x16_t *) ext;
int8x16_t *s1_128=(int8x16_t *) systematic1;
int8x16_t *s0_128=(int8x16_t *) systematic0;
......@@ -1296,7 +1296,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
for (i=0; i<myloop; i++) {
#if defined(__x86_64__) || defined(__i386__)
*ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
*ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++);
#endif
ext_128++;
......
......@@ -36,11 +36,7 @@
#define MAX_BLOCK_LENGTH 8448
#ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x)
# else
# define malloc16(x) memalign(16,x)
# endif
#endif
#define NR_LDPC_PROFILER_DETAIL
......
......@@ -44,8 +44,7 @@
#include "crcext.h"
#include "types.h"
#include <immintrin.h>
#include <wmmintrin.h>
#include "PHY/sse_intrin.h"
/**
* PCLMULQDQ CRC computation context structure
......
......@@ -22,7 +22,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <immintrin.h>
#include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
......
......@@ -22,7 +22,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <immintrin.h>
#include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
......@@ -44,8 +44,8 @@ void nrLDPC_bnProcPc_BG2_generator_AVX512(const char *dir, int R)
abort();
}
// fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n");
// fprintf(fd,"#include <stdint.h>\n");
// fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups;
......
......@@ -22,7 +22,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>
#include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
......@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG1_generator_AVX512(const char *dir, int R)
}
//fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n");
//fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProc_BG1_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n", ratestr[R]);
......
......@@ -22,7 +22,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <immintrin.h>
#include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
......@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG2_generator_AVX512(const char *dir, int R)
}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"void nrLDPC_bnProc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -140,7 +140,7 @@ int ldpc_encoder_orig(unsigned char *test_input,unsigned char *channel_input,int
shift=5; // AVX2 - 256-bit SIMD
mask=31;
strcpy(data_type,"__m256i");
strcpy(xor_command,"_mm256_xor_si256");
strcpy(xor_command,"simde_mm256_xor_si256");
}
else if ((Zc&15)==0) {
shift=4; // SSE4 - 128-bit SIMD
......
This diff is collapsed.
......@@ -193,7 +193,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
__m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2];
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
......@@ -224,7 +224,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
metrics48_63 = _mm_setzero_si128();
}
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
if (offset == 0) {
// set initial metrics
......@@ -318,7 +318,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m0_ptr = (uint8x16_t *)&m0_table[table_offset];
m1_ptr = (uint8x16_t *)&m1_table[table_offset];
......
......@@ -136,7 +136,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
__m128i min_state,min_state2;
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
......@@ -165,7 +165,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
metrics16_31 = _mm_setzero_si128();
metrics32_47 = _mm_setzero_si128();
metrics48_63 = _mm_setzero_si128();
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
metrics0_31.val[0] = vdupq_n_u8(0);
metrics0_31.val[1] = vdupq_n_u8(0);
metrics32_63.val[0] = vdupq_n_u8(0);
......@@ -259,7 +259,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
m0_ptr = (uint8x16_t *)&m0_table[table_offset];
m1_ptr = (uint8x16_t *)&m1_table[table_offset];
......@@ -353,7 +353,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
}
#elif defined(__arm__)
#elif defined(__arm__) || defined(__aarch64__)
for (s=0; s<16; s++)
if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) {
maxm = ((uint8_t *)&metrics0_31.val[0])[s];
......
......@@ -661,11 +661,7 @@ int phy_init_nr_gNB(PHY_VARS_gNB *gNB,
int n_buf = Prx*max_ul_mimo_layers;
int nb_re_pusch = N_RB_UL * NR_NB_SC_PER_RB;
#ifdef __AVX2__
int nb_re_pusch2 = nb_re_pusch + (nb_re_pusch&7);
#else
int nb_re_pusch2 = nb_re_pusch;
#endif
for (int ULSCH_id=0; ULSCH_id<gNB->number_of_nr_ulsch_max; ULSCH_id++) {
pusch_vars[ULSCH_id] = (NR_gNB_PUSCH *)malloc16_clear( sizeof(NR_gNB_PUSCH) );
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -61,11 +61,7 @@
#define msg_nrt printf
//use msg_nrt in the non real-time context (for initialization, ...)
#ifndef malloc16
#ifdef __AVX2__
#define malloc16(x) memalign(32,x)
#else
#define malloc16(x) memalign(16,x)
#endif
#endif
#define free16(y,x) free(y)
#define bigmalloc malloc
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment