diff --git a/CMakeLists.txt b/CMakeLists.txt index a741961c..76bca813 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project (vectorscan C CXX) set (HS_MAJOR_VERSION 5) set (HS_MINOR_VERSION 4) -set (HS_PATCH_VERSION 3) +set (HS_PATCH_VERSION 5) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -128,11 +128,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) -option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" - OFF) +option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF) -option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" - OFF) +option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" OFF) if (BUILD_AVX512VBMI) set(BUILD_AVX512 ON) @@ -140,47 +138,95 @@ endif () # TODO: per platform config files? - # remove CMake's idea of optimisation - foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") - endforeach () +# remove CMake's idea of optimisation +foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") +endforeach () - if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64 AND NOT ARCH_PPC64EL) - message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") - # If gcc doesn't recognise the host cpu, then mtune=native becomes - # generic, which isn't very good in some cases. march=native looks at - # cpuid info and then chooses the best microarch it can (and replaces - # the flag), so use that for tune. +if (CMAKE_C_COMPILER_ID MATCHES "Intel") + set(SKYLAKE_FLAG "-xCORE-AVX512") +else () + set(SKYLAKE_FLAG "-march=skylake-avx512") + set(ICELAKE_FLAG "-march=icelake-server") +endif () - # arg1 might exist if using ccache - string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) - string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" - GNUCC_ARCH "${_GCC_OUTPUT}") +if(ARCH_PPC64EL) + set(ARCH_FLAG mcpu) +else() + set(ARCH_FLAG march) +endif() - if (ARCH_IA32 OR ARCH_X86_64) - # test the parsed flag - set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_QUIET ERROR_QUIET - INPUT_FILE /dev/null - RESULT_VARIABLE GNUCC_TUNE_TEST) - if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") - endif() - set(TUNE_FLAG ${GNUCC_ARCH}) - else() - set(TUNE_FLAG native) - endif() - elseif (NOT TUNE_FLAG) +# Detect best GNUCC_ARCH to tune for +if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at + # cpuid info and then chooses the best microarch it can (and replaces + # the flag), so use that for tune. + + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -mtune=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid, falling back to -mtune=native") set(TUNE_FLAG native) + else() + set(TUNE_FLAG ${GNUCC_ARCH}) + message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") endif() +elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) + if (ARCH_IA32 OR ARCH_X86_64) + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + elseif(ARCH_AARCH64) + set(GNUCC_ARCH armv8) + set(TUNE_FLAG generic) + elseif(ARCH_ARM32) + set(GNUCC_ARCH armv7a) + set(TUNE_FLAG generic) + else() + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + endif() + message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") +elseif (CROSS_COMPILE) + set(GNUCC_ARCH generic) + set(TUNE_FLAG generic) +endif() +if (ARCH_IA32 OR ARCH_X86_64) + if (NOT FAT_RUNTIME) + if (BUILD_AVX512) + set(ARCH_C_FLAGS "${SKYLAKE_FLAG}") + set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}") + elseif (BUILD_AVX2) + set(ARCH_C_FLAGS "-mavx2") + set(ARCH_CXX_FLAGS "-mavx2") + else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") + endif() + else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") + endif() +endif() + +if (ARCH_AARCH64) if (BUILD_SVE2_BITPERM) set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm") elseif (BUILD_SVE2) @@ -188,92 +234,89 @@ endif () elseif (BUILD_SVE) set(GNUCC_ARCH "${GNUCC_ARCH}+sve") endif () +endif(ARCH_AARCH64) - # compiler version checks TODO: test more compilers - if (CMAKE_COMPILER_IS_GNUCXX) - set(GNUCXX_MINVER "4.8.1") - message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) - message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support") - endif() - endif() - - if(RELEASE_BUILD) - if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) - set(OPT_C_FLAG "-O3") - set(OPT_CXX_FLAG "-O3") - else () - set(OPT_C_FLAG "-Os") - set(OPT_CXX_FLAG "-Os") - endif () - else() - set(OPT_C_FLAG "-O0") - set(OPT_CXX_FLAG "-O0") - endif(RELEASE_BUILD) - - # set compiler flags - more are tested and added later - set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") - set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing -fno-new-ttp-matching") - - if (NOT RELEASE_BUILD) - # -Werror is most useful during development, don't potentially break - # release builds - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") - endif() - - if (DISABLE_ASSERTS) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") - endif() +set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}") +set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}") +#if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) +# if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) +# set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") +# endif() +# if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) +# set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") +# endif() +#endif() - if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - endif() - - if(ARCH_PPC64EL) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") - endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") - endif() - endif() +#if(ARCH_PPC64EL) +# if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) +# set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") +# endif() +# if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) +# set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") +# endif() +#endif() - if(CMAKE_COMPILER_IS_GNUCC) - # spurious warnings? - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") +# compiler version checks TODO: test more compilers +if (CMAKE_COMPILER_IS_GNUCXX) + set(GNUCXX_MINVER "9") + message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) + message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support") endif() +endif() - if(CMAKE_COMPILER_IS_GNUCXX) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") - endif () - # don't complain about abi - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") - endif() - - if (NOT(ARCH_IA32 AND RELEASE_BUILD)) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") - endif() - - - if (CMAKE_C_COMPILER_ID MATCHES "Intel") - set(SKYLAKE_FLAG "-xCORE-AVX512") +if(RELEASE_BUILD) + if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) + set(OPT_C_FLAG "-O3") + set(OPT_CXX_FLAG "-O3") else () - set(SKYLAKE_FLAG "-march=skylake-avx512") - set(ICELAKE_FLAG "-march=icelake-server") + set(OPT_C_FLAG "-Os") + set(OPT_CXX_FLAG "-Os") endif () +else() + set(OPT_C_FLAG "-O0") + set(OPT_CXX_FLAG "-O0") +endif(RELEASE_BUILD) + +# set compiler flags - more are tested and added later +set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") +set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") +if (NOT CMAKE_COMPILER_IS_CLANG) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching") +endif() + +if (NOT RELEASE_BUILD) + # -Werror is most useful during development, don't potentially break + # release builds + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") +endif() + +if (DISABLE_ASSERTS) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") +endif() + +if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") +endif() + +if(CMAKE_COMPILER_IS_GNUCXX) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") + endif () + # don't complain about abi + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") +endif() + +if (NOT(ARCH_IA32 AND RELEASE_BUILD)) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") +endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) if (ARCH_IA32 OR ARCH_X86_64) @@ -289,8 +332,6 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) message(FATAL_ERROR "arm_sve.h is required to build for SVE.") endif() endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") elseif (ARCH_PPC64EL) CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H) endif() @@ -318,8 +359,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") # This is a Linux-only feature for now - requires platform support # elsewhere message(STATUS "generator is ${CMAKE_GENERATOR}") - if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND - CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") + if (CMAKE_C_COMPILER_IS_CLANG AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime") set (FAT_RUNTIME_REQUISITES FALSE) elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR @@ -343,7 +383,10 @@ include (${CMAKE_MODULE_PATH}/arch.cmake) # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) -CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) +# Clang does not use __builtin_constant_p() the same way as gcc +if (NOT CMAKE_COMPILER_IS_CLANG) + CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) +endif() set(C_FLAGS_TO_CHECK # Variable length arrays are way bad, most especially at run time @@ -442,19 +485,22 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set(FREEBSD true) endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -if (NOT FAT_RUNTIME) - if (CROSS_COMPILE_AARCH64) + +if (FAT_RUNTIME) + if (NOT (ARCH_IA32 OR ARCH_X86_64)) + message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures") + else() + message(STATUS "Building runtime for multiple microarchitectures") + endif() +else() + if (CROSS_COMPILE) message(STATUS "Building for target CPU: ${ARCH_C_FLAGS}") else() message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}") endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") -else() - message(STATUS "Building runtime for multiple microarchitectures") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") add_subdirectory(util) add_subdirectory(doc/dev-reference) @@ -1171,10 +1217,6 @@ if (NOT FAT_RUNTIME) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - if (ARCH_IA32) - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3") - endif (ARCH_IA32) - add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c @@ -1205,14 +1247,14 @@ else (FAT_RUNTIME) add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_LIBS $) set_target_properties(hs_exec_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" + COMPILE_FLAGS "-march=core2 -msse4.2" RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_LIBS $) set_target_properties(hs_exec_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7 -mssse3" + COMPILE_FLAGS "-march=corei7 -msse4.2" RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" ) @@ -1254,10 +1296,6 @@ else (FAT_RUNTIME) ${RUNTIME_LIBS}) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - if (ARCH_IA32 OR ARCH_X86_64) - set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-mssse3") - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3") - endif () # we want the static lib for testing add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c @@ -1274,14 +1312,14 @@ else (FAT_RUNTIME) add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" + COMPILE_FLAGS "-march=core2 -msse4.2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7 -mssse3" + COMPILE_FLAGS "-march=corei7 -msse4.2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" ) diff --git a/Jenkinsfile b/Jenkinsfile index 1883f43a..3dbef5b6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,22 +1,590 @@ pipeline { - agent { - node { - label 'x86' - } - - } - stages { - stage('Release, SSE') { - agent { - node { - label 'x86' + agent none + stages { + stage("Build") { + failFast true + parallel { + stage("Release/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Release/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Release/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Release/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-release-fat/bin/unit-hyperscan' + } + } + } + } + stage("Debug/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Debug/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Debug/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Debug/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-debug-fat/bin/unit-hyperscan' + } + } + } + } + stage("Release/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-arm/bin/unit-hyperscan' + } + } + } + } + stage("Debug/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-arm/bin/unit-hyperscan' + } + } + } + } + stage("Release/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-power/bin/unit-hyperscan' + } + } + } + } + stage("Debug/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-power/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-clang-release-fat/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-clang-debug-fat/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-arm/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-arm/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-power/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-power/bin/unit-hyperscan' + } + } + } + } + } } - - } - steps { - sh 'mkdir build-release-SSE && cmake -DCMAKE_BUILD_TYPE=Release -C build-release-SSE' - } } - - } -} \ No newline at end of file +} diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 2100799f..29c39b49 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -88,7 +88,7 @@ if (FAT_RUNTIME) set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") endif (BUILD_AVX512VBMI) elseif (BUILD_AVX2) - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx") + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2") elseif () set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3") endif () @@ -98,12 +98,12 @@ else (NOT FAT_RUNTIME) endif () if (ARCH_IA32 OR ARCH_X86_64) - # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic + # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { __m128i a = _mm_set1_epi8(1); (void)_mm_shuffle_epi8(a, a); -}" HAVE_SSSE3) +}" HAVE_SSE42) # now look for AVX2 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> @@ -157,8 +157,8 @@ else () endif () if (FAT_RUNTIME) - if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) - message(FATAL_ERROR "SSSE3 support required to build fat runtime") + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42) + message(FATAL_ERROR "SSE4.2 support required to build fat runtime") endif () if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2) message(FATAL_ERROR "AVX2 support required to build fat runtime") @@ -179,8 +179,8 @@ else (NOT FAT_RUNTIME) if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI) message(STATUS "Building without AVX512VBMI support") endif () - if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) - message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42) + message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required") endif () if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) message(FATAL_ERROR "NEON support required for ARM support") diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 2cdc3a6e..5a2b85b2 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,3 +1,8 @@ +# determine compiler +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_COMPILER_IS_CLANG TRUE) +endif() + # determine the target arch if (CROSS_COMPILE_AARCH64) @@ -10,7 +15,7 @@ else() CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) - CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) + CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) set(ARCH_64_BIT TRUE) else() diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 4c68b485..902d3624 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -122,24 +122,252 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b); } -static really_really_inline +static really_inline m128 lshift_m128(m128 a, unsigned b) { - return (m128) vshlq_n_u32((uint32x4_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u32((uint32x4_t)a, b); + } +#endif +#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_LSHIFT_m128(a, 1); + CASE_LSHIFT_m128(a, 2); + CASE_LSHIFT_m128(a, 3); + CASE_LSHIFT_m128(a, 4); + CASE_LSHIFT_m128(a, 5); + CASE_LSHIFT_m128(a, 6); + CASE_LSHIFT_m128(a, 7); + CASE_LSHIFT_m128(a, 8); + CASE_LSHIFT_m128(a, 9); + CASE_LSHIFT_m128(a, 10); + CASE_LSHIFT_m128(a, 11); + CASE_LSHIFT_m128(a, 12); + CASE_LSHIFT_m128(a, 13); + CASE_LSHIFT_m128(a, 14); + CASE_LSHIFT_m128(a, 15); + CASE_LSHIFT_m128(a, 16); + CASE_LSHIFT_m128(a, 17); + CASE_LSHIFT_m128(a, 18); + CASE_LSHIFT_m128(a, 19); + CASE_LSHIFT_m128(a, 20); + CASE_LSHIFT_m128(a, 21); + CASE_LSHIFT_m128(a, 22); + CASE_LSHIFT_m128(a, 23); + CASE_LSHIFT_m128(a, 24); + CASE_LSHIFT_m128(a, 25); + CASE_LSHIFT_m128(a, 26); + CASE_LSHIFT_m128(a, 27); + CASE_LSHIFT_m128(a, 28); + CASE_LSHIFT_m128(a, 29); + CASE_LSHIFT_m128(a, 30); + CASE_LSHIFT_m128(a, 31); + default: return zeroes128(); break; + } +#undef CASE_LSHIFT_m128 } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - return (m128) vshrq_n_u32((uint32x4_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u32((uint32x4_t)a, b); + } +#endif +#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_RSHIFT_m128(a, 1); + CASE_RSHIFT_m128(a, 2); + CASE_RSHIFT_m128(a, 3); + CASE_RSHIFT_m128(a, 4); + CASE_RSHIFT_m128(a, 5); + CASE_RSHIFT_m128(a, 6); + CASE_RSHIFT_m128(a, 7); + CASE_RSHIFT_m128(a, 8); + CASE_RSHIFT_m128(a, 9); + CASE_RSHIFT_m128(a, 10); + CASE_RSHIFT_m128(a, 11); + CASE_RSHIFT_m128(a, 12); + CASE_RSHIFT_m128(a, 13); + CASE_RSHIFT_m128(a, 14); + CASE_RSHIFT_m128(a, 15); + CASE_RSHIFT_m128(a, 16); + CASE_RSHIFT_m128(a, 17); + CASE_RSHIFT_m128(a, 18); + CASE_RSHIFT_m128(a, 19); + CASE_RSHIFT_m128(a, 20); + CASE_RSHIFT_m128(a, 21); + CASE_RSHIFT_m128(a, 22); + CASE_RSHIFT_m128(a, 23); + CASE_RSHIFT_m128(a, 24); + CASE_RSHIFT_m128(a, 25); + CASE_RSHIFT_m128(a, 26); + CASE_RSHIFT_m128(a, 27); + CASE_RSHIFT_m128(a, 28); + CASE_RSHIFT_m128(a, 29); + CASE_RSHIFT_m128(a, 30); + CASE_RSHIFT_m128(a, 31); + default: return zeroes128(); break; + } +#undef CASE_RSHIFT_m128 } static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { - return (m128) vshlq_n_u64((uint64x2_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u64((uint64x2_t)a, b); + } +#endif +#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_LSHIFT64_m128(a, 1); + CASE_LSHIFT64_m128(a, 2); + CASE_LSHIFT64_m128(a, 3); + CASE_LSHIFT64_m128(a, 4); + CASE_LSHIFT64_m128(a, 5); + CASE_LSHIFT64_m128(a, 6); + CASE_LSHIFT64_m128(a, 7); + CASE_LSHIFT64_m128(a, 8); + CASE_LSHIFT64_m128(a, 9); + CASE_LSHIFT64_m128(a, 10); + CASE_LSHIFT64_m128(a, 11); + CASE_LSHIFT64_m128(a, 12); + CASE_LSHIFT64_m128(a, 13); + CASE_LSHIFT64_m128(a, 14); + CASE_LSHIFT64_m128(a, 15); + CASE_LSHIFT64_m128(a, 16); + CASE_LSHIFT64_m128(a, 17); + CASE_LSHIFT64_m128(a, 18); + CASE_LSHIFT64_m128(a, 19); + CASE_LSHIFT64_m128(a, 20); + CASE_LSHIFT64_m128(a, 21); + CASE_LSHIFT64_m128(a, 22); + CASE_LSHIFT64_m128(a, 23); + CASE_LSHIFT64_m128(a, 24); + CASE_LSHIFT64_m128(a, 25); + CASE_LSHIFT64_m128(a, 26); + CASE_LSHIFT64_m128(a, 27); + CASE_LSHIFT64_m128(a, 28); + CASE_LSHIFT64_m128(a, 29); + CASE_LSHIFT64_m128(a, 30); + CASE_LSHIFT64_m128(a, 31); + CASE_LSHIFT64_m128(a, 32); + CASE_LSHIFT64_m128(a, 33); + CASE_LSHIFT64_m128(a, 34); + CASE_LSHIFT64_m128(a, 35); + CASE_LSHIFT64_m128(a, 36); + CASE_LSHIFT64_m128(a, 37); + CASE_LSHIFT64_m128(a, 38); + CASE_LSHIFT64_m128(a, 39); + CASE_LSHIFT64_m128(a, 40); + CASE_LSHIFT64_m128(a, 41); + CASE_LSHIFT64_m128(a, 42); + CASE_LSHIFT64_m128(a, 43); + CASE_LSHIFT64_m128(a, 44); + CASE_LSHIFT64_m128(a, 45); + CASE_LSHIFT64_m128(a, 46); + CASE_LSHIFT64_m128(a, 47); + CASE_LSHIFT64_m128(a, 48); + CASE_LSHIFT64_m128(a, 49); + CASE_LSHIFT64_m128(a, 50); + CASE_LSHIFT64_m128(a, 51); + CASE_LSHIFT64_m128(a, 52); + CASE_LSHIFT64_m128(a, 53); + CASE_LSHIFT64_m128(a, 54); + CASE_LSHIFT64_m128(a, 55); + CASE_LSHIFT64_m128(a, 56); + CASE_LSHIFT64_m128(a, 57); + CASE_LSHIFT64_m128(a, 58); + CASE_LSHIFT64_m128(a, 59); + CASE_LSHIFT64_m128(a, 60); + CASE_LSHIFT64_m128(a, 61); + CASE_LSHIFT64_m128(a, 62); + CASE_LSHIFT64_m128(a, 63); + default: return zeroes128(); break; + } +#undef CASE_LSHIFT64_m128 } static really_really_inline m128 rshift64_m128(m128 a, unsigned b) { - return (m128) vshrq_n_u64((uint64x2_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u64((uint64x2_t)a, b); + } +#endif +#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_RSHIFT64_m128(a, 1); + CASE_RSHIFT64_m128(a, 2); + CASE_RSHIFT64_m128(a, 3); + CASE_RSHIFT64_m128(a, 4); + CASE_RSHIFT64_m128(a, 5); + CASE_RSHIFT64_m128(a, 6); + CASE_RSHIFT64_m128(a, 7); + CASE_RSHIFT64_m128(a, 8); + CASE_RSHIFT64_m128(a, 9); + CASE_RSHIFT64_m128(a, 10); + CASE_RSHIFT64_m128(a, 11); + CASE_RSHIFT64_m128(a, 12); + CASE_RSHIFT64_m128(a, 13); + CASE_RSHIFT64_m128(a, 14); + CASE_RSHIFT64_m128(a, 15); + CASE_RSHIFT64_m128(a, 16); + CASE_RSHIFT64_m128(a, 17); + CASE_RSHIFT64_m128(a, 18); + CASE_RSHIFT64_m128(a, 19); + CASE_RSHIFT64_m128(a, 20); + CASE_RSHIFT64_m128(a, 21); + CASE_RSHIFT64_m128(a, 22); + CASE_RSHIFT64_m128(a, 23); + CASE_RSHIFT64_m128(a, 24); + CASE_RSHIFT64_m128(a, 25); + CASE_RSHIFT64_m128(a, 26); + CASE_RSHIFT64_m128(a, 27); + CASE_RSHIFT64_m128(a, 28); + CASE_RSHIFT64_m128(a, 29); + CASE_RSHIFT64_m128(a, 30); + CASE_RSHIFT64_m128(a, 31); + CASE_RSHIFT64_m128(a, 32); + CASE_RSHIFT64_m128(a, 33); + CASE_RSHIFT64_m128(a, 34); + CASE_RSHIFT64_m128(a, 35); + CASE_RSHIFT64_m128(a, 36); + CASE_RSHIFT64_m128(a, 37); + CASE_RSHIFT64_m128(a, 38); + CASE_RSHIFT64_m128(a, 39); + CASE_RSHIFT64_m128(a, 40); + CASE_RSHIFT64_m128(a, 41); + CASE_RSHIFT64_m128(a, 42); + CASE_RSHIFT64_m128(a, 43); + CASE_RSHIFT64_m128(a, 44); + CASE_RSHIFT64_m128(a, 45); + CASE_RSHIFT64_m128(a, 46); + CASE_RSHIFT64_m128(a, 47); + CASE_RSHIFT64_m128(a, 48); + CASE_RSHIFT64_m128(a, 49); + CASE_RSHIFT64_m128(a, 50); + CASE_RSHIFT64_m128(a, 51); + CASE_RSHIFT64_m128(a, 52); + CASE_RSHIFT64_m128(a, 53); + CASE_RSHIFT64_m128(a, 54); + CASE_RSHIFT64_m128(a, 55); + CASE_RSHIFT64_m128(a, 56); + CASE_RSHIFT64_m128(a, 57); + CASE_RSHIFT64_m128(a, 58); + CASE_RSHIFT64_m128(a, 59); + CASE_RSHIFT64_m128(a, 60); + CASE_RSHIFT64_m128(a, 61); + CASE_RSHIFT64_m128(a, 62); + CASE_RSHIFT64_m128(a, 63); + default: return zeroes128(); break; + } +#undef CASE_RSHIFT64_m128 } static really_inline m128 eq128(m128 a, m128 b) { @@ -191,9 +419,11 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { -#if defined(HS_OPTIMIZE) - return vgetq_lane_u32((uint32x4_t) in, imm); -#else +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return vgetq_lane_u32((uint32x4_t) in, imm); + } +#endif switch (imm) { case 0: return vgetq_lane_u32((uint32x4_t) in, 0); @@ -211,13 +441,14 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { return 0; break; } -#endif } static really_inline u64a extract64from128(const m128 in, unsigned imm) { -#if defined(HS_OPTIMIZE) - return vgetq_lane_u64((uint64x2_t) in, imm); -#else +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return vgetq_lane_u64((uint64x2_t) in, imm); + } +#endif switch (imm) { case 0: return vgetq_lane_u64((uint64x2_t) in, 0); @@ -229,7 +460,6 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { return 0; break; } -#endif } static really_inline m128 low64from128(const m128 in) { diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h index 21dae5cb..8a5b0e25 100644 --- a/src/util/arch/ppc64el/simd_types.h +++ b/src/util/arch/ppc64el/simd_types.h @@ -30,7 +30,7 @@ #define ARCH_PPC64EL_SIMD_TYPES_H #if !defined(m128) && defined(HAVE_VSX) -typedef __vector int32_t m128; +typedef __vector int m128; #endif #endif /* ARCH_PPC64EL_SIMD_TYPES_H */ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 137fc94f..d046ed47 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -43,6 +43,18 @@ #include // for memcpy +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; +/* typedef __vector uint64_t uint64x2_t; typedef __vector int64_t int64x2_t; typedef __vector uint32_t uint32x4_t; @@ -50,7 +62,7 @@ typedef __vector int32_t int32x4_t; typedef __vector uint16_t uint16x8_t; typedef __vector int16_t int16x8_t; typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t; +typedef __vector int8_t int8x16_t;*/ #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 @@ -182,13 +194,13 @@ m128 rshift_m128(m128 a, unsigned b) { static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { - uint64x2_t shift_indices = vec_splats((uint64_t)b); + uint64x2_t shift_indices = vec_splats((ulong64_t)b); return (m128) vec_sl((int64x2_t)a, shift_indices); } static really_really_inline m128 rshift64_m128(m128 a, unsigned b) { - uint64x2_t shift_indices = vec_splats((uint64_t)b); + uint64x2_t shift_indices = vec_splats((ulong64_t)b); return (m128) vec_sr((int64x2_t)a, shift_indices); } @@ -213,11 +225,11 @@ static really_inline u32 movemask128(m128 a) { uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); return s5[0]; diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h index c04e8dab..e1642404 100644 --- a/src/util/arch/x86/simd_types.h +++ b/src/util/arch/x86/simd_types.h @@ -30,7 +30,7 @@ #ifndef SIMD_TYPES_X86_H #define SIMD_TYPES_X86_H -#if !defined(m128) && defined(HAVE_SSE2) +#if !defined(m128) && defined(HAVE_SSE42) typedef __m128i m128; #endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 0deff7e5..4f0fd1a9 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -51,6 +51,7 @@ typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif typedef struct {m128 lo; m128 mid; m128 hi;} m384; + #if !defined(m512) && !defined(HAVE_SIMD_512_BITS) typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 980f0b39..ff1149a9 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -45,112 +45,112 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8x16_t other) +really_inline SuperVector<16>::SuperVector(int8x16_t other) { u.s8x16[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8x16_t other) +really_inline SuperVector<16>::SuperVector(uint8x16_t other) { u.u8x16[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int16x8_t other) +really_inline SuperVector<16>::SuperVector(int16x8_t other) { u.s16x8[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16x8_t other) +really_inline SuperVector<16>::SuperVector(uint16x8_t other) { u.u16x8[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int32x4_t other) +really_inline SuperVector<16>::SuperVector(int32x4_t other) { u.s32x4[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32x4_t other) +really_inline SuperVector<16>::SuperVector(uint32x4_t other) { u.u32x4[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int64x2_t other) +really_inline SuperVector<16>::SuperVector(int64x2_t other) { u.s64x2[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64x2_t other) +really_inline SuperVector<16>::SuperVector(uint64x2_t other) { u.u64x2[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.s8x16[0] = vdupq_n_s8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.u8x16[0] = vdupq_n_u8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.s16x8[0] = vdupq_n_s16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.u16x8[0] = vdupq_n_u16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.s32x4[0] = vdupq_n_s32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.u32x4[0] = vdupq_n_u32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { u.s64x2[0] = vdupq_n_s64(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { u.u64x2[0] = vdupq_n_u64(other); } @@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; }); + Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; }); return result; } @@ -386,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; }); return result; } @@ -394,9 +394,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 32) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; }); + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; }); return result; } @@ -404,9 +404,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 64) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; }); + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; }); return result; } @@ -416,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; }); return result; } @@ -430,9 +430,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 8) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; }); + Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; }); return result; } @@ -442,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; }); return result; } @@ -450,9 +450,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 32) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; }); + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; }); return result; } @@ -460,9 +460,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 64) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; }); + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; }); return result; } @@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; }); return result; } diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index e054e02e..109b8d5e 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,16 +39,6 @@ #include "util/supervector/supervector.hpp" #include - -typedef __vector uint64_t uint64x2_t; -typedef __vector int64_t int64x2_t; -typedef __vector uint32_t uint32x4_t; -typedef __vector int32_t int32x4_t; -typedef __vector uint16_t uint16x8_t; -typedef __vector int16_t int16x8_t; -typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t; - // 128-bit Powerpc64le implementation template<> @@ -65,58 +55,58 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } // Constants @@ -229,11 +219,11 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); return s5[0]; @@ -271,7 +261,7 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - return { (m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> @@ -313,7 +303,7 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> @@ -352,7 +342,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -362,7 +352,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -372,7 +362,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -382,7 +372,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); return result; } @@ -392,7 +382,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); return result; } @@ -408,7 +398,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -418,7 +408,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -428,7 +418,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -438,7 +428,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); return result; } @@ -448,7 +438,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), u.s8x16[0], 16 - n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; }); return result; } @@ -523,14 +513,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const long64_t*)ptr); } template <> really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const long64_t*)ptr); } template <> diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp index dbd863f4..bdc6608e 100644 --- a/src/util/supervector/arch/ppc64el/types.hpp +++ b/src/util/supervector/arch/ppc64el/types.hpp @@ -27,6 +27,18 @@ * POSSIBILITY OF SUCH DAMAGE. */ +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; + #if !defined(m128) && defined(HAVE_VSX) -typedef __vector int32_t m128; +typedef __vector int m128; #endif diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index b7686220..157f1dc4 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -55,56 +55,56 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.v128[0] = _mm_set1_epi8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.v128[0] = _mm_set1_epi8(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.v128[0] = _mm_set1_epi16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.v128[0] = _mm_set1_epi16(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.v128[0] = _mm_set1_epi32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.v128[0] = _mm_set1_epi32(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { u.v128[0] = _mm_set1_epi64x(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { u.v128[0] = _mm_set1_epi64x(static_cast(other)); } @@ -608,56 +608,56 @@ really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector template<> template<> -really_inline SuperVector<32>::SuperVector(int8_t const other) +really_inline SuperVector<32>::SuperVector(int8_t const other) { u.v256[0] = _mm256_set1_epi8(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint8_t const other) +really_inline SuperVector<32>::SuperVector(uint8_t const other) { u.v256[0] = _mm256_set1_epi8(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int16_t const other) +really_inline SuperVector<32>::SuperVector(int16_t const other) { u.v256[0] = _mm256_set1_epi16(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint16_t const other) +really_inline SuperVector<32>::SuperVector(uint16_t const other) { u.v256[0] = _mm256_set1_epi16(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int32_t const other) +really_inline SuperVector<32>::SuperVector(int32_t const other) { u.v256[0] = _mm256_set1_epi32(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint32_t const other) +really_inline SuperVector<32>::SuperVector(uint32_t const other) { u.v256[0] = _mm256_set1_epi32(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int64_t const other) +really_inline SuperVector<32>::SuperVector(int64_t const other) { u.v256[0] = _mm256_set1_epi64x(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint64_t const other) +really_inline SuperVector<32>::SuperVector(uint64_t const other) { u.v256[0] = _mm256_set1_epi64x(static_cast(other)); } @@ -804,7 +804,7 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const template <> template -really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const +really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const { if (N == 0) return *this; if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; @@ -950,11 +950,11 @@ really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; - if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; + if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; }); Unroller<17, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; - if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; + if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; }); return result; } @@ -1240,56 +1240,56 @@ really_inline SuperVector<64>::SuperVector(m128 const v) template<> template<> -really_inline SuperVector<64>::SuperVector(int8_t const o) +really_inline SuperVector<64>::SuperVector(int8_t const o) { u.v512[0] = _mm512_set1_epi8(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint8_t const o) +really_inline SuperVector<64>::SuperVector(uint8_t const o) { u.v512[0] = _mm512_set1_epi8(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int16_t const o) +really_inline SuperVector<64>::SuperVector(int16_t const o) { u.v512[0] = _mm512_set1_epi16(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint16_t const o) +really_inline SuperVector<64>::SuperVector(uint16_t const o) { u.v512[0] = _mm512_set1_epi16(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int32_t const o) +really_inline SuperVector<64>::SuperVector(int32_t const o) { u.v512[0] = _mm512_set1_epi32(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint32_t const o) +really_inline SuperVector<64>::SuperVector(uint32_t const o) { u.v512[0] = _mm512_set1_epi32(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int64_t const o) +really_inline SuperVector<64>::SuperVector(int64_t const o) { u.v512[0] = _mm512_set1_epi64(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint64_t const o) +really_inline SuperVector<64>::SuperVector(uint64_t const o) { u.v512[0] = _mm512_set1_epi64(static_cast(o)); } diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 737412f6..f0ddf63c 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -165,7 +165,7 @@ public: typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size]; typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size]; -#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL) uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; @@ -176,17 +176,6 @@ public: int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif -#if defined(ARCH_PPC64EL) - __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; - __vector int64_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; - __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; - __vector int32_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; - __vector uint16_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; - __vector int16_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; - __vector uint8_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; - __vector int8_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; -#endif - uint64_t u64[SIZE / sizeof(uint64_t)]; int64_t s64[SIZE / sizeof(int64_t)]; uint32_t u32[SIZE / sizeof(uint32_t)]; @@ -200,7 +189,7 @@ public: } u; constexpr SuperVector() {}; - constexpr SuperVector(SuperVector const &other) + SuperVector(SuperVector const &other) :u(other.u) {}; SuperVector(typename base_type::type const v); diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 900078bb..bc2421dc 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) { simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) int64x2_t a = { 0x123456789abcdefLL, ~0LL }; - simd = vreinterpretq_s64_s8(a); + simd = vreinterpretq_s32_s64(a); #elif defined(ARCH_PPC64EL) int64x2_t a = {0x123456789abcdefLL, ~0LL }; simd = (m128) a; diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index 82cee0ff..ea942ef1 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -33,9 +33,6 @@ SET(corpusomatic_SRCS ng_find_matches.cpp ) add_library(corpusomatic STATIC ${corpusomatic_SRCS}) -if (ARCH_IA32 OR ARCH_X86_64) - set_target_properties(corpusomatic PROPERTIES COMPILE_FLAGS "-mssse3") -endif () set(databaseutil_SRCS database_util.cpp