Commit 144e7a34 authored by Bruno Mongazon-Cazavet's avatar Bruno Mongazon-Cazavet

essage=Simde - 1636

parents d4b51059 d902d76d
...@@ -178,7 +178,6 @@ if (CUDA_FOUND) ...@@ -178,7 +178,6 @@ if (CUDA_FOUND)
# Disable warnings for CUDA # Disable warnings for CUDA
SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lpthread;-w;-O3;--default-stream;per-thread;-I/usr/local/cuda/inc;-L/usr/local/cuda/lib -lcutil;-rdc=true;-lcudadevrt") SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-lpthread;-w;-O3;--default-stream;per-thread;-I/usr/local/cuda/inc;-L/usr/local/cuda/lib -lcutil;-rdc=true;-lcudadevrt")
SET(CUDA_VERBOSE_BUILD ON) SET(CUDA_VERBOSE_BUILD ON)
SET(CUDA_HOST_COMPILER "/usr/bin/g++")
SET(CUDA_SEPARABLE_COMPILATION ON) SET(CUDA_SEPARABLE_COMPILATION ON)
...@@ -198,44 +197,36 @@ message("CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}") ...@@ -198,44 +197,36 @@ message("CMAKE_BUILD_TYPE is ${CMAKE_BUILD_TYPE}")
add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel." Debug Release RelWithDebInfo MinSizeRel) add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel." Debug Release RelWithDebInfo MinSizeRel)
Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}") Message("Architecture is ${CMAKE_SYSTEM_PROCESSOR}")
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") # in case /proc/cpuinfo exists we want to inspect available Intrinsics
set(C_FLAGS_PROCESSOR "-gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt") # -so not to go always through SIMDE emulation
else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") # -so to avoid AVX512 instructions generation by gcc
if(EXISTS "/proc/cpuinfo") if(EXISTS "/proc/cpuinfo")
file(STRINGS "/proc/cpuinfo" CPUINFO REGEX flags LIMIT_COUNT 1) file(STRINGS "/proc/cpuinfo" CPUINFO REGEX flags LIMIT_COUNT 1)
message("NOAVX512 is ${NOAVX512}") message("AVX512 is ${AVX512}")
if (CPUINFO MATCHES "avx512bw" AND "${NOAVX512}" STREQUAL "False") message("AVX2 is ${AVX2}")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512 " ) if ("${AVX512}" STREQUAL "False")
set(COMPILATION_AVX2 "True") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mno-avx512f -march=native -DSIMDE_X86_AVX2_NATIVE -DSIMDE_X86_AVX2_NATIVE -DSIMDE_X86_AVX512BW_NATIVE -DSIMDE_X86_AVX512F_NATIVE -DSIMDE_X86_AVX512VL_NATIVE -DSIMDE_X86_AVX_NATIVE -DSIMDE_X86_AVX_NATIVE -DSIMDE_X86_F16C_NATIVE -DSIMDE_X86_FMA_NATIVE -DSIMDE_X86_GFNI_NATIVE -DSIMDE_X86_MMX_NATIVE -DSIMDE_X86_PCLMUL_NATIVE -DSIMDE_X86_SSE2_NATIVE -DSIMDE_X86_SSE3_NATIVE -DSIMDE_X86_SSE4_1_NATIVE -DSIMDE_X86_SSE4_2_NATIVE -DSIMDE_X86_SSE_NATIVE -DSIMDE_X86_SSSE3_NATIVE -DSIMDE_X86_VPCLMULQDQ_NATIVE -DSIMDE_X86_XOP_HAVE_COM_ -DSIMDE_X86_XOP_NATIVE")
else() else()
if (CPUINFO MATCHES "avx2") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx512bw -march=skylake-avx512 -mtune=skylake-avx512")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2") endif()
set(COMPILATION_AVX2 "True") if (CPUINFO MATCHES "avx2" AND "${AVX2}" STREQUAL "True")
else() set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_AVX2_NATIVE -DSIMDE_X86_VPCLMULQDQ_NATIVE")
set(COMPILATION_AVX2 "False") endif()
endif() if (CPUINFO MATCHES "sse4_1")
if (CPUINFO MATCHES "sse4_1") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSE4_1_NATIVE")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1 -mpclmul") endif()
endif() if (CPUINFO MATCHES "sse4_2")
if (CPUINFO MATCHES "ssse3") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSE4_2_NATIVE")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mssse3") endif()
endif() if (CPUINFO MATCHES "ssse3")
endif() set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSSE3_NATIVE")
else() endif()
Message("/proc/cpuinfo does not exit. We will use manual CPU flags")
endif()
endif() endif()
set(C_FLAGS_PROCESSOR " ${C_FLAGS_PROCESSOR} ${CFLAGS_PROCESSOR_USER}") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -fno-var-tracking-assignments -march=native")
Message("C_FLAGS_PROCESSOR is ${C_FLAGS_PROCESSOR}") Message("C_FLAGS_PROCESSOR is ${C_FLAGS_PROCESSOR}")
#if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86")
# if ( (NOT( C_FLAGS_PROCESSOR MATCHES "ssse3")) OR (NOT( C_FLAGS_PROCESSOR MATCHES "msse4.1")) )
# Message(FATAL_ERROR "For x86 Architecture, you must have following flags: -mssse3 -msse4.1. The current detected flags are: ${C_FLAGS_PROCESSOR}. You can pass the flags manually in build script, for example: ./build_oai --cflags_processor \"-mssse3 -msse4.1 -mavx2\" ")
# endif()
#endif()
# #
# add autotools definitions that were maybe used! # add autotools definitions that were maybe used!
...@@ -881,9 +872,6 @@ endif () ...@@ -881,9 +872,6 @@ endif ()
include_directories ("${OPENAIR_DIR}/sdr/COMMON") include_directories ("${OPENAIR_DIR}/sdr/COMMON")
Message("DEADLINE_SCHEDULER flag is ${DEADLINE_SCHEDULER}")
Message("CPU_Affinity flag is ${CPU_AFFINITY}")
############################################################## ##############################################################
# ???!!! TO BE DOCUMENTED OPTIONS !!!??? # ???!!! TO BE DOCUMENTED OPTIONS !!!???
############################################################## ##############################################################
...@@ -1738,14 +1726,8 @@ if (${SMBV}) ...@@ -1738,14 +1726,8 @@ if (${SMBV})
set(PHY_SRC "${PHY_SRC} ${OPENAIR1_DIR}/PHY/TOOLS/smbv.c") set(PHY_SRC "${PHY_SRC} ${OPENAIR1_DIR}/PHY/TOOLS/smbv.c")
endif (${SMBV}) endif (${SMBV})
if (${COMPILATION_AVX2} STREQUAL "True") set(PHY_SRC_UE ${PHY_SRC_UE} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
#set(PHY_SRC ${PHY_SRC} ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c) set(PHY_NR_UE_SRC ${PHY_NR_UE_SRC} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
set(PHY_SRC_UE ${PHY_SRC_UE} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
endif ()
if (${COMPILATION_AVX2} STREQUAL "True")
set(PHY_NR_UE_SRC ${PHY_NR_UE_SRC} ${OPENAIR1_DIR}/PHY/LTE_UE_TRANSPORT/dlsch_llr_computation_avx2.c)
endif ()
add_library(PHY_COMMON ${PHY_SRC_COMMON}) add_library(PHY_COMMON ${PHY_SRC_COMMON})
add_dependencies(PHY_COMMON rrc_flag) add_dependencies(PHY_COMMON rrc_flag)
...@@ -2857,16 +2839,14 @@ add_executable(nr-uesoftmodem ...@@ -2857,16 +2839,14 @@ add_executable(nr-uesoftmodem
target_link_libraries (nr-uesoftmodem target_link_libraries (nr-uesoftmodem
-Wl,--start-group -Wl,--start-group
RRC_LIB NR_RRC_LIB NGAP_LIB NGAP_GNB SECU_CN SECU_OSA UTIL HASHTABLE SCTP_CLIENT SCHED_RU_LIB SCHED_UE_LIB SCHED_NR_UE_LIB NR_RRC_LIB SECU_CN SECU_OSA UTIL HASHTABLE SCHED_RU_LIB SCHED_NR_UE_LIB
PHY_COMMON PHY_NR_COMMON PHY_UE PHY_NR_UE PHY_RU NR_L2_UE L2_UE_LTE_NR MAC_NR_COMMON NFAPI_COMMON_LIB NFAPI_LIB NFAPI_PNF_LIB PHY_COMMON PHY_NR_COMMON PHY_NR_UE NR_L2_UE L2_UE_LTE_NR MAC_NR_COMMON NFAPI_COMMON_LIB NFAPI_LIB NFAPI_PNF_LIB
NFAPI_USER_LIB MISC_NFAPI_NR_LIB S1AP_LIB S1AP_ENB NFAPI_USER_LIB MISC_NFAPI_NR_LIB
${RAL_LIB} ${NAS_UE_LIB} ITTI ${FLPT_MSG_LIB} ${ATLAS_LIBRARIES} ${RAL_LIB} ITTI ${ATLAS_LIBRARIES} LIB_5GNAS_GNB LIB_NAS_SIMUE ${NAS_SIM_LIB}
NFAPI_USER_LIB S1AP_LIB S1AP_ENB
${RAL_LIB} ${NAS_UE_LIB} ITTI ${FLPT_MSG_LIB} ${ATLAS_LIBRARIES} LIB_5GNAS_GNB LIB_NAS_SIMUE ${NAS_SIM_LIB}
-Wl,--end-group z dl) -Wl,--end-group z dl)
target_link_libraries (nr-uesoftmodem ${LIBXML2_LIBRARIES}) target_link_libraries (nr-uesoftmodem ${LIBXML2_LIBRARIES})
target_link_libraries (nr-uesoftmodem pthread m CONFIG_LIB rt crypt ${CRYPTO_LIBRARIES} ${OPENSSL_LIBRARIES} sctp ${XFORMS_LIBRARIES} ${CMAKE_DL_LIBS} ${LIBYAML_LIBRARIES} ${ATLAS_LIBRARIES}) target_link_libraries (nr-uesoftmodem pthread m CONFIG_LIB rt crypt ${CRYPTO_LIBRARIES} ${OPENSSL_LIBRARIES} ${XFORMS_LIBRARIES} ${CMAKE_DL_LIBS} ${LIBYAML_LIBRARIES} ${ATLAS_LIBRARIES})
target_link_libraries (nr-uesoftmodem ${LIB_LMS_LIBRARIES}) target_link_libraries (nr-uesoftmodem ${LIB_LMS_LIBRARIES})
target_link_libraries (nr-uesoftmodem ${T_LIB}) target_link_libraries (nr-uesoftmodem ${T_LIB})
...@@ -3154,7 +3134,7 @@ function(make_driver name dir) ...@@ -3154,7 +3134,7 @@ function(make_driver name dir)
endforeach() endforeach()
CONFIGURE_FILE(${OPENAIR_CMAKE}/tools/Kbuild.cmake ${OPENAIR_BIN_DIR}/${name}/Kbuild) CONFIGURE_FILE(${OPENAIR_CMAKE}/tools/Kbuild.cmake ${OPENAIR_BIN_DIR}/${name}/Kbuild)
add_custom_command(OUTPUT ${name}.ko add_custom_command(OUTPUT ${name}.ko
COMMAND make -j2 -C ${module_build_path} M=${OPENAIR_BIN_DIR}/${name} COMMAND make -C ${module_build_path} M=${OPENAIR_BIN_DIR}/${name}
WORKING_DIRECTORY ${OPENAIR_BIN_DIR}/${name} WORKING_DIRECTORY ${OPENAIR_BIN_DIR}/${name}
COMMENT "building ${module}.ko" COMMENT "building ${module}.ko"
VERBATIM VERBATIM
......
...@@ -177,7 +177,7 @@ function variant__v1__enb_usrp { ...@@ -177,7 +177,7 @@ function variant__v1__enb_usrp {
function variant__v2__basic_sim { function variant__v2__basic_sim {
NB_PATTERN_FILES=11 NB_PATTERN_FILES=11
BUILD_OPTIONS="--eNB --UE" BUILD_OPTIONS="--eNB --UE --noavx512"
VM_MEMORY=8192 VM_MEMORY=8192
RUN_OPTIONS="complex" RUN_OPTIONS="complex"
} }
...@@ -201,28 +201,28 @@ function variant__v5__gnb_usrp { ...@@ -201,28 +201,28 @@ function variant__v5__gnb_usrp {
VM_MEMORY=10240 VM_MEMORY=10240
VM_CPU=8 VM_CPU=8
NB_PATTERN_FILES=6 NB_PATTERN_FILES=6
BUILD_OPTIONS="--gNB -w USRP" BUILD_OPTIONS="--gNB -w USRP --noavx512"
} }
function variant__v6__nr_ue_usrp { function variant__v6__nr_ue_usrp {
VM_MEMORY=4096 VM_MEMORY=4096
VM_CPU=4 VM_CPU=4
NB_PATTERN_FILES=6 NB_PATTERN_FILES=6
BUILD_OPTIONS="--nrUE -w USRP" BUILD_OPTIONS="--nrUE -w USRP --noavx512"
} }
function variant__v7__enb_ethernet { function variant__v7__enb_ethernet {
VM_MEMORY=4096 VM_MEMORY=4096
ARCHIVES_LOC=enb_eth ARCHIVES_LOC=enb_eth
NB_PATTERN_FILES=6 NB_PATTERN_FILES=6
BUILD_OPTIONS="--eNB -w USRP" BUILD_OPTIONS="--eNB -w USRP --noavx512"
} }
function variant__v8__ue_ethernet { function variant__v8__ue_ethernet {
VM_MEMORY=4096 VM_MEMORY=4096
ARCHIVES_LOC=ue_eth ARCHIVES_LOC=ue_eth
NB_PATTERN_FILES=10 NB_PATTERN_FILES=10
BUILD_OPTIONS="--UE -w USRP" BUILD_OPTIONS="--UE -w USRP --noavx512"
} }
function variant__v10__flexran_rtc { function variant__v10__flexran_rtc {
......
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
<mode>TesteNB</mode> <mode>TesteNB</mode>
<class>Build_eNB</class> <class>Build_eNB</class>
<desc>Build gNB (USRP)</desc> <desc>Build gNB (USRP)</desc>
<Build_eNB_args>--gNB -w USRP --ninja --cmake-opt -DBoost_INCLUDE_DIR=/usr/include/boost169 --noavx512</Build_eNB_args> <Build_eNB_args>--gNB -w USRP --ninja --cmake-opt -DBoost_INCLUDE_DIR=/usr/include/boost169 --cmake-opt -DCUDA_HOST_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/gcc -c -P</Build_eNB_args>
<forced_workspace_cleanup>True</forced_workspace_cleanup> <forced_workspace_cleanup>True</forced_workspace_cleanup>
</testCase> </testCase>
......
...@@ -55,7 +55,8 @@ BUILD_COVERITY_SCAN=0 ...@@ -55,7 +55,8 @@ BUILD_COVERITY_SCAN=0
DISABLE_HARDWARE_DEPENDENCY="False" DISABLE_HARDWARE_DEPENDENCY="False"
CMAKE_BUILD_TYPE="RelWithDebInfo" CMAKE_BUILD_TYPE="RelWithDebInfo"
CMAKE_CMD="$CMAKE" CMAKE_CMD="$CMAKE"
NOAVX512="False" AVX512="True"
AVX2="True"
BUILD_ECLIPSE=0 BUILD_ECLIPSE=0
NR="False" NR="False"
OPTIONAL_LIBRARIES="telnetsrv enbscope uescope nrscope" OPTIONAL_LIBRARIES="telnetsrv enbscope uescope nrscope"
...@@ -127,11 +128,9 @@ Options: ...@@ -127,11 +128,9 @@ Options:
-x | --xforms -x | --xforms
Will compile with software oscilloscope features Will compile with software oscilloscope features
--verbose-ci --verbose-ci
Compile with verbose instructions in CI Docker env Compile with verbose instructions in CI Docker env
--verbose-compile --verbose-compile
Shows detailed compilation instructions in makefile Shows detailed compilation instructions in makefile
--cflags_processor
Manually Add CFLAGS of processor if they are not detected correctly by script. Only add these flags if you know your processor supports them. Example flags: -msse3 -msse4.1 -msse4.2 -mavx2
--build-doxygen --build-doxygen
Builds doxygen based documentation. Builds doxygen based documentation.
--build-coverity-scan --build-coverity-scan
...@@ -159,10 +158,10 @@ Options: ...@@ -159,10 +158,10 @@ Options:
Build eclipse project files. Build eclipse project files.
--build-lib <libraries> --build-lib <libraries>
Build optional shared library, <libraries> can be one or several of $OPTIONAL_LIBRARIES or \"all\" Build optional shared library, <libraries> can be one or several of $OPTIONAL_LIBRARIES or \"all\"
--usrp-recplay
Build for I/Q record-playback modes
--noavx512 --noavx512
Build without AVX512 if it is present on CPU Disable AVX512 intrinsics whatever processor capability is
--noavx2
Disable AVX2 intrinsics if processor supports it or use SIMDE emulation
-k | --skip-shared-libraries -k | --skip-shared-libraries
Skip build for shared libraries to reduce compilation time when building frequently for debugging purposes Skip build for shared libraries to reduce compilation time when building frequently for debugging purposes
--ninja --ninja
...@@ -349,10 +348,6 @@ function main() { ...@@ -349,10 +348,6 @@ function main() {
VERBOSE_COMPILE=1 VERBOSE_COMPILE=1
echo_info "Will compile with verbose instructions" echo_info "Will compile with verbose instructions"
shift;; shift;;
--cflags_processor)
CMAKE_CMD="$CMAKE_CMD -DCFLAGS_PROCESSOR_USER=\"$2\""
echo_info "Setting CPU FLAGS from USER to: $2"
shift 2;;
--build-doxygen) --build-doxygen)
BUILD_DOXYGEN=1 BUILD_DOXYGEN=1
echo_info "Will build doxygen support" echo_info "Will build doxygen support"
...@@ -430,8 +425,12 @@ function main() { ...@@ -430,8 +425,12 @@ function main() {
fi fi
shift 2;; shift 2;;
--noavx512) --noavx512)
NOAVX512="True" AVX512="False"
echo_info "Disabling AVX512" echo_info "Disabling AVX512 instructions"
shift 1;;
--noavx2)
AVX2="False"
echo_info "Disabling AVX2 instructions"
shift 1;; shift 1;;
-k | --skip-shared-libraries) -k | --skip-shared-libraries)
SKIP_SHARED_LIB_FLAG="True" SKIP_SHARED_LIB_FLAG="True"
...@@ -616,7 +615,7 @@ function main() { ...@@ -616,7 +615,7 @@ function main() {
if [[ ${#CMAKE_CXX_FLAGS[@]} > 0 ]]; then CMAKE_CMD="$CMAKE_CMD -DCMAKE_CXX_FLAGS=\"${CMAKE_CXX_FLAGS[*]}\""; fi if [[ ${#CMAKE_CXX_FLAGS[@]} > 0 ]]; then CMAKE_CMD="$CMAKE_CMD -DCMAKE_CXX_FLAGS=\"${CMAKE_CXX_FLAGS[*]}\""; fi
# for historical reasons we build in a subdirectory cmake_targets/XYZ/build, # for historical reasons we build in a subdirectory cmake_targets/XYZ/build,
# e.g., cmake_targets/ran_build/build, hence the ../../.. # e.g., cmake_targets/ran_build/build, hence the ../../..
CMAKE_CMD="$CMAKE_CMD -DNOAVX512=\"${NOAVX512[*]}\" ../../.." CMAKE_CMD="$CMAKE_CMD -DAVX512=\"${AVX512[*]}\" -DAVX2=\"${AVX2[*]}\" ../../.."
echo_info "running $CMAKE_CMD" echo_info "running $CMAKE_CMD"
eval $CMAKE_CMD eval $CMAKE_CMD
......
...@@ -122,6 +122,7 @@ check_supported_distribution() { ...@@ -122,6 +122,7 @@ check_supported_distribution() {
"rhel8.6") return 0 ;; "rhel8.6") return 0 ;;
"rhel8.7") return 0 ;; "rhel8.7") return 0 ;;
"centos7") return 0 ;; "centos7") return 0 ;;
"centos8") return 0 ;;
esac esac
return 1 return 1
} }
...@@ -897,6 +898,7 @@ check_install_oai_software() { ...@@ -897,6 +898,7 @@ check_install_oai_software() {
fi fi
install_asn1c_from_source $1 install_asn1c_from_source $1
install_simde_from_source $1
} }
install_asn1c_from_source(){ install_asn1c_from_source(){
...@@ -923,6 +925,37 @@ install_asn1c_from_source(){ ...@@ -923,6 +925,37 @@ install_asn1c_from_source(){
) > $asn1_install_log 2>&1 ) > $asn1_install_log 2>&1
} }
install_simde_from_source(){
echo_info "\nInstalling SIMDE from source without test cases (header files only)"
cd /tmp
$SUDO rm -rf /tmp/simde
git clone https://github.com/simd-everywhere/simde-no-tests.git /tmp/simde
cd /tmp/simde
# brute force copy into /usr/include
$SUDO \cp -rv ../simde /usr/include
$SUDO patch /usr/include/simde/x86/avx.h << FIN
--- /usr/include/simde/x86/avx.h.old 2022-10-03 19:17:39.828223432 +0200
+++ /usr/include/simde/x86/avx.h 2022-10-05 16:19:55.086019445 +0200
@@ -3636,12 +3636,16 @@
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int imm8)
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) {
+#if defined(SIMDE_X86_AVX_NATIVE)
+ return _mm256_insertf128_si256(a, b, imm8);
+#else
simde__m256i_private a_ = simde__m256i_to_private(a);
simde__m128i_private b_ = simde__m128i_to_private(b);
a_.m128i_private[imm8] = b_;
return simde__m256i_from_private(a_);
+#endif
}
#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
#undef _mm256_insertf128_si256
FIN
}
################################################# #################################################
# 2. compile # 2. compile
################################################ ################################################
......
...@@ -37,7 +37,7 @@ extern double cpu_freq_GHz __attribute__ ((aligned(32)));; ...@@ -37,7 +37,7 @@ extern double cpu_freq_GHz __attribute__ ((aligned(32)));;
// structure to store data to compute cpu measurment // structure to store data to compute cpu measurment
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
typedef long long oai_cputime_t; typedef long long oai_cputime_t;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
typedef uint32_t oai_cputime_t; typedef uint32_t oai_cputime_t;
#else #else
#error "building on unsupported CPU architecture" #error "building on unsupported CPU architecture"
...@@ -107,7 +107,7 @@ static inline unsigned long long rdtsc_oai(void) { ...@@ -107,7 +107,7 @@ static inline unsigned long long rdtsc_oai(void) {
return (d<<32) | a; return (d<<32) | a;
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
static inline uint32_t rdtsc_oai(void) __attribute__((always_inline)); static inline uint32_t rdtsc_oai(void) __attribute__((always_inline));
static inline uint32_t rdtsc_oai(void) { static inline uint32_t rdtsc_oai(void) {
uint32_t r = 0; uint32_t r = 0;
......
...@@ -34,11 +34,7 @@ extern "C" { ...@@ -34,11 +34,7 @@ extern "C" {
#endif #endif
#ifndef malloc16 #ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x+32) # define malloc16(x) memalign(32,x+32)
# else
# define malloc16(x) memalign(16,x+16)
# endif
#endif #endif
#define free16(y,x) free(y) #define free16(y,x) free(y)
#define bigmalloc malloc #define bigmalloc malloc
...@@ -54,11 +50,7 @@ extern "C" { ...@@ -54,11 +50,7 @@ extern "C" {
} while (0) } while (0)
static inline void *malloc16_clear( size_t size ) { static inline void *malloc16_clear( size_t size ) {
#ifdef __AVX2__
void *ptr = memalign(32, size+32); void *ptr = memalign(32, size+32);
#else
void *ptr = memalign(16, size+16);
#endif
DevAssert(ptr); DevAssert(ptr);
memset( ptr, 0, size ); memset( ptr, 0, size );
return ptr; return ptr;
...@@ -91,11 +83,7 @@ static inline void *malloc_or_fail(size_t size) { ...@@ -91,11 +83,7 @@ static inline void *malloc_or_fail(size_t size) {
# define msg(aRGS...) LOG_D(PHY, ##aRGS) # define msg(aRGS...) LOG_D(PHY, ##aRGS)
#endif #endif
#ifndef malloc16 #ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x) # define malloc16(x) memalign(32,x)
# else
# define malloc16(x) memalign(16,x)
# endif
#endif #endif
#define free16(y,x) free(y) #define free16(y,x) free(y)
......
...@@ -41,7 +41,7 @@ ...@@ -41,7 +41,7 @@
#include <executables/split_headers.h> #include <executables/split_headers.h>
#include <openair1/PHY/CODING/coding_extern.h> #include <openair1/PHY/CODING/coding_extern.h>
#include <threadPool/thread-pool.h> #include <threadPool/thread-pool.h>
#include <emmintrin.h> #include "PHY/sse_intrin.h"
#define FS6_BUF_SIZE 1000*1000 #define FS6_BUF_SIZE 1000*1000
static UDPsock_t sockFS6; static UDPsock_t sockFS6;
......
...@@ -75,7 +75,8 @@ int oai_exit = 0; ...@@ -75,7 +75,8 @@ int oai_exit = 0;
double cpuf; double cpuf;
THREAD_STRUCT thread_struct; THREAD_STRUCT thread_struct;
uint16_t sf_ahead=4; extern uint16_t sf_ahead; // Bell Labs
//uint16_t sf_ahead=4;
//uint16_t slot_ahead=6; //uint16_t slot_ahead=6;
int otg_enabled; int otg_enabled;
uint64_t downlink_frequency[MAX_NUM_CCs][4]; uint64_t downlink_frequency[MAX_NUM_CCs][4];
...@@ -725,12 +726,8 @@ void ocp_tx_rf(RU_t *ru, L1_rxtx_proc_t *proc) { ...@@ -725,12 +726,8 @@ void ocp_tx_rf(RU_t *ru, L1_rxtx_proc_t *proc) {
} }
#if defined(__x86_64) || defined(__i386__) #if defined(__x86_64) || defined(__i386__)
#ifdef __AVX2__
sf_extension = (sf_extension)&0xfffffff8; sf_extension = (sf_extension)&0xfffffff8;
#else #elif defined(__arm__) || defined(__aarch64__)
sf_extension = (sf_extension)&0xfffffffc;
#endif
#elif defined(__arm__)
sf_extension = (sf_extension)&0xfffffffc; sf_extension = (sf_extension)&0xfffffffc;
#endif #endif
...@@ -1141,6 +1138,7 @@ int main ( int argc, char **argv ) { ...@@ -1141,6 +1138,7 @@ int main ( int argc, char **argv ) {
int i; int i;
int CC_id = 0; int CC_id = 0;
int node_type = ngran_eNB; int node_type = ngran_eNB;
sf_ahead=4; // Bell Labs
AssertFatal(load_configmodule(argc,argv,0), "[SOFTMODEM] Error, configuration module init failed\n"); AssertFatal(load_configmodule(argc,argv,0), "[SOFTMODEM] Error, configuration module init failed\n");
logInit(); logInit();
printf("Reading in command-line options\n"); printf("Reading in command-line options\n");
......
...@@ -210,10 +210,11 @@ void set_softmodem_sighandler(void) { ...@@ -210,10 +210,11 @@ void set_softmodem_sighandler(void) {
act.sa_handler=signal_handler; act.sa_handler=signal_handler;
sigaction(SOFTMODEM_RTSIGNAL,&act,&oldact); sigaction(SOFTMODEM_RTSIGNAL,&act,&oldact);
// Disabled in order generate a core dump for analysis with gdb // Disabled in order generate a core dump for analysis with gdb
// Enable for clean exit on CTRL-C (i.e. record player, USRP...)
signal(SIGINT, signal_handler);
# if 0 # if 0
printf("Send signal %d to display resource usage...\n",SIGRTMIN+1); printf("Send signal %d to display resource usage...\n",SIGRTMIN+1);
signal(SIGSEGV, signal_handler); signal(SIGSEGV, signal_handler);
signal(SIGINT, signal_handler);
signal(SIGTERM, signal_handler); signal(SIGTERM, signal_handler);
signal(SIGABRT, signal_handler); signal(SIGABRT, signal_handler);
#endif #endif
......
This diff is collapsed.
...@@ -153,7 +153,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity, ...@@ -153,7 +153,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
__m128i *y_parity128 = (__m128i *)y_parity; __m128i *y_parity128 = (__m128i *)y_parity;
__m128i *m10_128 = (__m128i *)m10; __m128i *m10_128 = (__m128i *)m10;
__m128i *m11_128 = (__m128i *)m11; __m128i *m11_128 = (__m128i *)m11;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t *systematic128 = (int8x16_t *)systematic; int8x16_t *systematic128 = (int8x16_t *)systematic;
int8x16_t *y_parity128 = (int8x16_t *)y_parity; int8x16_t *y_parity128 = (int8x16_t *)y_parity;
int8x16_t *m10_128 = (int8x16_t *)m10; int8x16_t *m10_128 = (int8x16_t *)m10;
...@@ -177,7 +177,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity, ...@@ -177,7 +177,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
_mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
_mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]); m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]); m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]);
#endif #endif
...@@ -193,7 +193,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity, ...@@ -193,7 +193,7 @@ void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
_mm_srai_epi16(_mm_adds_epi16(sh,yph),1)); _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1), m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
_mm_srai_epi16(_mm_subs_epi16(sh,yph),1)); _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]); m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]); m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]);
#endif #endif
...@@ -209,7 +209,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh ...@@ -209,7 +209,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i alpha_max; __m128i alpha_max;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr; int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr;
int8x16_t *m11p,*m10p; int8x16_t *m11p,*m10p;
int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
...@@ -299,7 +299,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh ...@@ -299,7 +299,7 @@ void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sh
alpha[112] = -MAX8/2; alpha[112] = -MAX8/2;
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
alpha128[0] = vdupq_n_s8(-MAX8/2); alpha128[0] = vdupq_n_s8(-MAX8/2);
alpha128[0] = vsetq_lane_s8(0,alpha128[0],0); alpha128[0] = vsetq_lane_s8(0,alpha128[0],0);
alpha128[1] = vdupq_n_s8(-MAX8/2); alpha128[1] = vdupq_n_s8(-MAX8/2);
...@@ -401,7 +401,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -401,7 +401,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
__m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i *beta128,*alpha128,*beta_ptr; __m128i *beta128,*alpha128,*beta_ptr;
__m128i beta_max; __m128i beta_max;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t m11_128,m10_128; int8x16_t m11_128,m10_128;
int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
int8x16_t new0,new1,new2,new3,new4,new5,new6,new7; int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
...@@ -421,7 +421,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -421,7 +421,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i *)&beta[frame_length<<3]; beta_ptr = (__m128i *)&beta[frame_length<<3];
alpha128 = (__m128i *)&alpha[0]; alpha128 = (__m128i *)&alpha[0];
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
beta_ptr = (int8x16_t *)&beta[frame_length<<3]; beta_ptr = (int8x16_t *)&beta[frame_length<<3];
alpha128 = (int8x16_t *)&alpha[0]; alpha128 = (int8x16_t *)&alpha[0];
#endif #endif
...@@ -451,7 +451,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -451,7 +451,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15); beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15);
beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15); beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15);
beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15); beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15); beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15);
beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15); beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15);
beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15); beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15);
...@@ -465,7 +465,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -465,7 +465,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i *)&beta[frame_length<<3]; beta_ptr = (__m128i *)&beta[frame_length<<3];
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
beta_ptr = (int8x16_t *)&beta[frame_length<<3]; beta_ptr = (int8x16_t *)&beta[frame_length<<3];
#endif #endif
...@@ -515,7 +515,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -515,7 +515,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max); beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max); beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max); beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m11_128=((int8x16_t *)m_11)[k]; m11_128=((int8x16_t *)m_11)[k];
m10_128=((int8x16_t *)m_10)[k]; m10_128=((int8x16_t *)m_10)[k];
m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11 m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11
...@@ -575,7 +575,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho ...@@ -575,7 +575,7 @@ void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned sho
beta_ptr[5] = _mm_srli_si128(beta128[5],1); beta_ptr[5] = _mm_srli_si128(beta128[5],1);
beta_ptr[6] = _mm_srli_si128(beta128[6],1); beta_ptr[6] = _mm_srli_si128(beta128[6],1);
beta_ptr[7] = _mm_srli_si128(beta128[7],1); beta_ptr[7] = _mm_srli_si128(beta128[7],1);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
beta128 = (int8x16_t *)&beta[0]; beta128 = (int8x16_t *)&beta[0];
beta_ptr = (int8x16_t *)&beta[frame_length<<3]; beta_ptr = (int8x16_t *)&beta[frame_length<<3];
beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8); beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8);
...@@ -608,7 +608,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l ...@@ -608,7 +608,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l
__m128i m01_1,m01_2,m01_3,m01_4; __m128i m01_1,m01_2,m01_3,m01_4;
__m128i m10_1,m10_2,m10_3,m10_4; __m128i m10_1,m10_2,m10_3,m10_4;
__m128i m11_1,m11_2,m11_3,m11_4; __m128i m11_1,m11_2,m11_3,m11_4;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t *alpha128=(int8x16_t *)alpha; int8x16_t *alpha128=(int8x16_t *)alpha;
int8x16_t *beta128=(int8x16_t *)beta; int8x16_t *beta128=(int8x16_t *)beta;
int8x16_t *m11_128,*m10_128,*ext_128; int8x16_t *m11_128,*m10_128,*ext_128;
...@@ -670,7 +670,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l ...@@ -670,7 +670,7 @@ void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, l
*ext_128 = _mm_subs_epi8(m10_1,m01_1); *ext_128 = _mm_subs_epi8(m10_1,m01_1);
alpha_ptr+=8; alpha_ptr+=8;
beta_ptr+=8; beta_ptr+=8;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m11_128 = (int8x16_t *)&m_11[k<<4]; m11_128 = (int8x16_t *)&m_11[k<<4];
m10_128 = (int8x16_t *)&m_10[k<<4]; m10_128 = (int8x16_t *)&m_10[k<<4];
ext_128 = (int8x16_t *)&ext[k<<4]; ext_128 = (int8x16_t *)&ext[k<<4];
...@@ -820,7 +820,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -820,7 +820,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
__m128i *yp128; __m128i *yp128;
__m128i tmp128[(n+8)>>3]; __m128i tmp128[(n+8)>>3];
__m128i tmp={0}, zeros=_mm_setzero_si128(); __m128i tmp={0}, zeros=_mm_setzero_si128();
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t *yp128; int8x16_t *yp128;
int8x16_t tmp128[(n+8)>>3]; int8x16_t tmp128[(n+8)>>3];
int8x16_t tmp, zeros=vdupq_n_s8(0); int8x16_t tmp, zeros=vdupq_n_s8(0);
...@@ -900,7 +900,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -900,7 +900,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],4)); ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],4));
yp128 = (__m128i *)y8; yp128 = (__m128i *)y8;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int32x4_t avg=vdupq_n_s32(0); int32x4_t avg=vdupq_n_s32(0);
for (i=0; i<(3*(n>>4))+1; i++) { for (i=0; i<(3*(n>>4))+1; i++) {
...@@ -1019,7 +1019,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1019,7 +1019,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13); tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14); tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15); ((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,0); tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,0);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,1); tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,1);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,2); tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,2);
...@@ -1067,7 +1067,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1067,7 +1067,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros)); decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
...@@ -1111,7 +1111,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1111,7 +1111,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15); tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]); tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]); ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2); tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
...@@ -1166,7 +1166,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1166,7 +1166,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
} }
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
uint8x16_t *dbytes=(uint8x16_t *)decoded_bytes_interl; uint8x16_t *dbytes=(uint8x16_t *)decoded_bytes_interl;
uint16x8_t mask __attribute__((aligned(16))); uint16x8_t mask __attribute__((aligned(16)));
int n_128=n2>>7; int n_128=n2>>7;
...@@ -1208,7 +1208,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1208,7 +1208,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8); tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8);
tmp=_mm_cmpgt_epi8(tmp,zeros); tmp=_mm_cmpgt_epi8(tmp,zeros);
((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp); ((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7); tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7);
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6); tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6);
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5); tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5);
...@@ -1286,7 +1286,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1286,7 +1286,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
__m128i *ext_128=(__m128i *) ext; __m128i *ext_128=(__m128i *) ext;
__m128i *s1_128=(__m128i *) systematic1; __m128i *s1_128=(__m128i *) systematic1;
__m128i *s0_128=(__m128i *) systematic0; __m128i *s0_128=(__m128i *) systematic0;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
int8x16_t *ext_128=(int8x16_t *) ext; int8x16_t *ext_128=(int8x16_t *) ext;
int8x16_t *s1_128=(int8x16_t *) systematic1; int8x16_t *s1_128=(int8x16_t *) systematic1;
int8x16_t *s0_128=(int8x16_t *) systematic0; int8x16_t *s0_128=(int8x16_t *) systematic0;
...@@ -1296,7 +1296,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y, ...@@ -1296,7 +1296,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
for (i=0; i<myloop; i++) { for (i=0; i<myloop; i++) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
*ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++); *ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
*ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++); *ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++);
#endif #endif
ext_128++; ext_128++;
......
...@@ -36,11 +36,7 @@ ...@@ -36,11 +36,7 @@
#define MAX_BLOCK_LENGTH 8448 #define MAX_BLOCK_LENGTH 8448
#ifndef malloc16 #ifndef malloc16
# ifdef __AVX2__
# define malloc16(x) memalign(32,x) # define malloc16(x) memalign(32,x)
# else
# define malloc16(x) memalign(16,x)
# endif
#endif #endif
#define NR_LDPC_PROFILER_DETAIL #define NR_LDPC_PROFILER_DETAIL
......
...@@ -44,8 +44,7 @@ ...@@ -44,8 +44,7 @@
#include "crcext.h" #include "crcext.h"
#include "types.h" #include "types.h"
#include <immintrin.h> #include "PHY/sse_intrin.h"
#include <wmmintrin.h>
/** /**
* PCLMULQDQ CRC computation context structure * PCLMULQDQ CRC computation context structure
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h" #include "../../nrLDPC_types.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h" #include "../../nrLDPC_types.h"
...@@ -44,8 +44,8 @@ void nrLDPC_bnProcPc_BG2_generator_AVX512(const char *dir, int R) ...@@ -44,8 +44,8 @@ void nrLDPC_bnProcPc_BG2_generator_AVX512(const char *dir, int R)
abort(); abort();
} }
// fprintf(fd,"#include <stdint.h>\n"); // fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n"); // fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]); fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups; const uint8_t* lut_numBnInBnGroups;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h" #include "../../nrLDPC_types.h"
...@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG1_generator_AVX512(const char *dir, int R) ...@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG1_generator_AVX512(const char *dir, int R)
} }
//fprintf(fd,"#include <stdint.h>\n"); //fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n"); //fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProc_BG1_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n", ratestr[R]); fprintf(fd,"static inline void nrLDPC_bnProc_BG1_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n", ratestr[R]);
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h" #include "../../nrLDPC_types.h"
...@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG2_generator_AVX512(const char *dir, int R) ...@@ -45,7 +45,7 @@ void nrLDPC_bnProc_BG2_generator_AVX512(const char *dir, int R)
} }
fprintf(fd,"#include <stdint.h>\n"); fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n"); fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"void nrLDPC_bnProc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n",ratestr[R]); fprintf(fd,"void nrLDPC_bnProc_BG2_R%s_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups; const uint8_t* lut_numBnInBnGroups;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -140,7 +140,7 @@ int ldpc_encoder_orig(unsigned char *test_input,unsigned char *channel_input,int ...@@ -140,7 +140,7 @@ int ldpc_encoder_orig(unsigned char *test_input,unsigned char *channel_input,int
shift=5; // AVX2 - 256-bit SIMD shift=5; // AVX2 - 256-bit SIMD
mask=31; mask=31;
strcpy(data_type,"__m256i"); strcpy(data_type,"__m256i");
strcpy(xor_command,"_mm256_xor_si256"); strcpy(xor_command,"simde_mm256_xor_si256");
} }
else if ((Zc&15)==0) { else if ((Zc&15)==0) {
shift=4; // SSE4 - 128-bit SIMD shift=4; // SSE4 - 128-bit SIMD
......
This diff is collapsed.
...@@ -193,7 +193,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short ...@@ -193,7 +193,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
__m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2]; __m128i *m0_ptr,*m1_ptr,*TB_ptr = &TB[offset<<2];
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes uint8x16x2_t TB[2*4095*8]; // 2 int8x16_t per input bit, 8 bits / byte, 4095 is largest packet size in bytes
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
...@@ -224,7 +224,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short ...@@ -224,7 +224,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
metrics48_63 = _mm_setzero_si128(); metrics48_63 = _mm_setzero_si128();
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
if (offset == 0) { if (offset == 0) {
// set initial metrics // set initial metrics
...@@ -318,7 +318,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short ...@@ -318,7 +318,7 @@ void phy_viterbi_dot11_sse2(char *y,unsigned char *decoded_bytes,unsigned short
metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m0_ptr = (uint8x16_t *)&m0_table[table_offset]; m0_ptr = (uint8x16_t *)&m0_table[table_offset];
m1_ptr = (uint8x16_t *)&m1_table[table_offset]; m1_ptr = (uint8x16_t *)&m1_table[table_offset];
......
...@@ -136,7 +136,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) ...@@ -136,7 +136,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
__m128i min_state,min_state2; __m128i min_state,min_state2;
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits uint8x16x2_t TB[2*8192]; // 2 int8x16_t per input bit, 8 bits / byte, 8192 is largest packet size in bits
uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63; uint8x16_t even0_30a,even0_30b,even32_62a,even32_62b,odd1_31a,odd1_31b,odd33_63a,odd33_63b,TBeven0_30,TBeven32_62,TBodd1_31,TBodd33_63;
...@@ -165,7 +165,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) ...@@ -165,7 +165,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
metrics16_31 = _mm_setzero_si128(); metrics16_31 = _mm_setzero_si128();
metrics32_47 = _mm_setzero_si128(); metrics32_47 = _mm_setzero_si128();
metrics48_63 = _mm_setzero_si128(); metrics48_63 = _mm_setzero_si128();
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
metrics0_31.val[0] = vdupq_n_u8(0); metrics0_31.val[0] = vdupq_n_u8(0);
metrics0_31.val[1] = vdupq_n_u8(0); metrics0_31.val[1] = vdupq_n_u8(0);
metrics32_63.val[0] = vdupq_n_u8(0); metrics32_63.val[0] = vdupq_n_u8(0);
...@@ -259,7 +259,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) ...@@ -259,7 +259,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
metrics16_31 = _mm_subs_epu8(metrics16_31,min_state); metrics16_31 = _mm_subs_epu8(metrics16_31,min_state);
metrics32_47 = _mm_subs_epu8(metrics32_47,min_state); metrics32_47 = _mm_subs_epu8(metrics32_47,min_state);
metrics48_63 = _mm_subs_epu8(metrics48_63,min_state); metrics48_63 = _mm_subs_epu8(metrics48_63,min_state);
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
m0_ptr = (uint8x16_t *)&m0_table[table_offset]; m0_ptr = (uint8x16_t *)&m0_table[table_offset];
m1_ptr = (uint8x16_t *)&m1_table[table_offset]; m1_ptr = (uint8x16_t *)&m1_table[table_offset];
...@@ -353,7 +353,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n) ...@@ -353,7 +353,7 @@ void phy_viterbi_lte_sse2(int8_t *y,uint8_t *decoded_bytes,uint16_t n)
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
for (s=0; s<16; s++) for (s=0; s<16; s++)
if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) { if (((uint8_t *)&metrics0_31.val[0])[s] > maxm) {
maxm = ((uint8_t *)&metrics0_31.val[0])[s]; maxm = ((uint8_t *)&metrics0_31.val[0])[s];
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment