From 404a0ab0f4ea80a012b01dcce2d4a7bc12d4c821 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Dec 2021 23:18:57 +0200 Subject: [PATCH 01/17] fix miscompilation with clang --- cmake/platform.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 2cdc3a6e..5a2b85b2 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,3 +1,8 @@ +# determine compiler +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_COMPILER_IS_CLANG TRUE) +endif() + # determine the target arch if (CROSS_COMPILE_AARCH64) @@ -10,7 +15,7 @@ else() CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) - CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) + CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) set(ARCH_64_BIT TRUE) else() From 7d600c4fcbb0c85f3082f164d969c245fc0a71d5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Dec 2021 23:19:43 +0200 Subject: [PATCH 02/17] bump base requirements to SSE4.2 --- cmake/arch.cmake | 14 +++++++------- src/util/arch/x86/simd_types.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 2100799f..29c39b49 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -88,7 +88,7 @@ if (FAT_RUNTIME) set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") endif (BUILD_AVX512VBMI) elseif (BUILD_AVX2) - set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx") + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2") elseif () set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3") endif () @@ -98,12 +98,12 @@ else (NOT FAT_RUNTIME) endif () if (ARCH_IA32 OR ARCH_X86_64) - # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic + # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { __m128i a = _mm_set1_epi8(1); (void)_mm_shuffle_epi8(a, a); -}" HAVE_SSSE3) +}" HAVE_SSE42) # now look for AVX2 CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> @@ -157,8 +157,8 @@ else () endif () if (FAT_RUNTIME) - if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) - message(FATAL_ERROR "SSSE3 support required to build fat runtime") + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42) + message(FATAL_ERROR "SSE4.2 support required to build fat runtime") endif () if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2) message(FATAL_ERROR "AVX2 support required to build fat runtime") @@ -179,8 +179,8 @@ else (NOT FAT_RUNTIME) if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI) message(STATUS "Building without AVX512VBMI support") endif () - if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) - message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42) + message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required") endif () if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) message(FATAL_ERROR "NEON support required for ARM support") diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h index c04e8dab..e1642404 100644 --- a/src/util/arch/x86/simd_types.h +++ b/src/util/arch/x86/simd_types.h @@ -30,7 +30,7 @@ #ifndef SIMD_TYPES_X86_H #define SIMD_TYPES_X86_H -#if !defined(m128) && defined(HAVE_SSE2) +#if !defined(m128) && defined(HAVE_SSE42) typedef __m128i m128; #endif From 0221dc1771716b50ec601cc21e9e769e184b9be2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Dec 2021 23:22:15 +0200 Subject: [PATCH 03/17] fix misompilations with clang++, as it is more strict --- src/util/supervector/arch/x86/impl.cpp | 54 +++++++++++++------------- src/util/supervector/supervector.hpp | 6 +-- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index b7686220..157f1dc4 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -55,56 +55,56 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.v128[0] = _mm_set1_epi8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.v128[0] = _mm_set1_epi8(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.v128[0] = _mm_set1_epi16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.v128[0] = _mm_set1_epi16(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.v128[0] = _mm_set1_epi32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.v128[0] = _mm_set1_epi32(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { u.v128[0] = _mm_set1_epi64x(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { u.v128[0] = _mm_set1_epi64x(static_cast(other)); } @@ -608,56 +608,56 @@ really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector template<> template<> -really_inline SuperVector<32>::SuperVector(int8_t const other) +really_inline SuperVector<32>::SuperVector(int8_t const other) { u.v256[0] = _mm256_set1_epi8(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint8_t const other) +really_inline SuperVector<32>::SuperVector(uint8_t const other) { u.v256[0] = _mm256_set1_epi8(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int16_t const other) +really_inline SuperVector<32>::SuperVector(int16_t const other) { u.v256[0] = _mm256_set1_epi16(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint16_t const other) +really_inline SuperVector<32>::SuperVector(uint16_t const other) { u.v256[0] = _mm256_set1_epi16(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int32_t const other) +really_inline SuperVector<32>::SuperVector(int32_t const other) { u.v256[0] = _mm256_set1_epi32(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint32_t const other) +really_inline SuperVector<32>::SuperVector(uint32_t const other) { u.v256[0] = _mm256_set1_epi32(static_cast(other)); } template<> template<> -really_inline SuperVector<32>::SuperVector(int64_t const other) +really_inline SuperVector<32>::SuperVector(int64_t const other) { u.v256[0] = _mm256_set1_epi64x(other); } template<> template<> -really_inline SuperVector<32>::SuperVector(uint64_t const other) +really_inline SuperVector<32>::SuperVector(uint64_t const other) { u.v256[0] = _mm256_set1_epi64x(static_cast(other)); } @@ -804,7 +804,7 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const template <> template -really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const +really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const { if (N == 0) return *this; if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; @@ -950,11 +950,11 @@ really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const SuperVector result; Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; - if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; + if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; }); Unroller<17, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; - if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; + if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; }); return result; } @@ -1240,56 +1240,56 @@ really_inline SuperVector<64>::SuperVector(m128 const v) template<> template<> -really_inline SuperVector<64>::SuperVector(int8_t const o) +really_inline SuperVector<64>::SuperVector(int8_t const o) { u.v512[0] = _mm512_set1_epi8(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint8_t const o) +really_inline SuperVector<64>::SuperVector(uint8_t const o) { u.v512[0] = _mm512_set1_epi8(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int16_t const o) +really_inline SuperVector<64>::SuperVector(int16_t const o) { u.v512[0] = _mm512_set1_epi16(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint16_t const o) +really_inline SuperVector<64>::SuperVector(uint16_t const o) { u.v512[0] = _mm512_set1_epi16(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int32_t const o) +really_inline SuperVector<64>::SuperVector(int32_t const o) { u.v512[0] = _mm512_set1_epi32(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint32_t const o) +really_inline SuperVector<64>::SuperVector(uint32_t const o) { u.v512[0] = _mm512_set1_epi32(static_cast(o)); } template<> template<> -really_inline SuperVector<64>::SuperVector(int64_t const o) +really_inline SuperVector<64>::SuperVector(int64_t const o) { u.v512[0] = _mm512_set1_epi64(o); } template<> template<> -really_inline SuperVector<64>::SuperVector(uint64_t const o) +really_inline SuperVector<64>::SuperVector(uint64_t const o) { u.v512[0] = _mm512_set1_epi64(static_cast(o)); } diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 737412f6..3ab3b13f 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -174,9 +174,7 @@ public: int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; -#endif - -#if defined(ARCH_PPC64EL) +#elif defined(ARCH_PPC64EL) __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; __vector int64_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; @@ -200,7 +198,7 @@ public: } u; constexpr SuperVector() {}; - constexpr SuperVector(SuperVector const &other) + SuperVector(SuperVector const &other) :u(other.u) {}; SuperVector(typename base_type::type const v); From 1f4143de81fab6619a44aa6ae175e1cec2e51992 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Dec 2021 23:23:37 +0200 Subject: [PATCH 04/17] rework CMakeLists.txt to ensure it works with clang --- CMakeLists.txt | 284 ++++++++++++++++++++++++++----------------------- 1 file changed, 153 insertions(+), 131 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a741961c..90395329 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project (vectorscan C CXX) set (HS_MAJOR_VERSION 5) set (HS_MINOR_VERSION 4) -set (HS_PATCH_VERSION 3) +set (HS_PATCH_VERSION 5) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) @@ -128,11 +128,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) -option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" - OFF) +option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF) -option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" - OFF) +option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" OFF) if (BUILD_AVX512VBMI) set(BUILD_AVX512 ON) @@ -140,47 +138,71 @@ endif () # TODO: per platform config files? - # remove CMake's idea of optimisation - foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") - string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") - endforeach () +# remove CMake's idea of optimisation +foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") +endforeach () - if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64 AND NOT ARCH_PPC64EL) - message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") - # If gcc doesn't recognise the host cpu, then mtune=native becomes - # generic, which isn't very good in some cases. march=native looks at - # cpuid info and then chooses the best microarch it can (and replaces - # the flag), so use that for tune. +if (CMAKE_C_COMPILER_ID MATCHES "Intel") + set(SKYLAKE_FLAG "-xCORE-AVX512") +else () + set(SKYLAKE_FLAG "-march=skylake-avx512") + set(ICELAKE_FLAG "-march=icelake-server") +endif () - # arg1 might exist if using ccache - string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) - string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" - GNUCC_ARCH "${_GCC_OUTPUT}") +# Detect best GNUCC_ARCH to tune for +if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at + # cpuid info and then chooses the best microarch it can (and replaces + # the flag), so use that for tune. - if (ARCH_IA32 OR ARCH_X86_64) - # test the parsed flag - set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) - execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} - OUTPUT_QUIET ERROR_QUIET - INPUT_FILE /dev/null - RESULT_VARIABLE GNUCC_TUNE_TEST) - if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") - endif() - set(TUNE_FLAG ${GNUCC_ARCH}) - else() - set(TUNE_FLAG native) - endif() - elseif (NOT TUNE_FLAG) + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + string(FIND "${_GCC_OUTPUT}" "march" POS) + string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) + string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") set(TUNE_FLAG native) + else() + set(TUNE_FLAG ${GNUCC_ARCH}) endif() + message(STATUS "gcc will tune for ${GNUCC_ARCH}") +elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + message(STATUS "clang will tune for ${TUNE_FLAG}") + if (BUILD_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}") + elseif (BUILD_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + endif() + message(STATUS "${CMAKE_C_FLAGS}") + message(STATUS "${CMAKE_CXX_FLAGS}") +elseif (CROSS_COMPILE) + set(GNUCC_ARCH generic) + set(TUNE_FLAG generic) +endif() +if (ARCH_AARCH64) if (BUILD_SVE2_BITPERM) set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm") elseif (BUILD_SVE2) @@ -188,92 +210,88 @@ endif () elseif (BUILD_SVE) set(GNUCC_ARCH "${GNUCC_ARCH}+sve") endif () +endif(ARCH_AARCH64) - # compiler version checks TODO: test more compilers - if (CMAKE_COMPILER_IS_GNUCXX) - set(GNUCXX_MINVER "4.8.1") - message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) - message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support") - endif() - endif() - - if(RELEASE_BUILD) - if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) - set(OPT_C_FLAG "-O3") - set(OPT_CXX_FLAG "-O3") - else () - set(OPT_C_FLAG "-Os") - set(OPT_CXX_FLAG "-Os") - endif () - else() - set(OPT_C_FLAG "-O0") - set(OPT_CXX_FLAG "-O0") - endif(RELEASE_BUILD) - - # set compiler flags - more are tested and added later - set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") - set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing -fno-new-ttp-matching") - - if (NOT RELEASE_BUILD) - # -Werror is most useful during development, don't potentially break - # release builds - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") - endif() - - if (DISABLE_ASSERTS) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") - endif() - - - if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) +if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() + endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") endif() +endif() - if(ARCH_PPC64EL) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") - endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") - endif() +if(ARCH_PPC64EL) + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") endif() - - if(CMAKE_COMPILER_IS_GNUCC) - # spurious warnings? - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") endif() +endif() - if(CMAKE_COMPILER_IS_GNUCXX) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") - endif () - # don't complain about abi - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") +# compiler version checks TODO: test more compilers +if (CMAKE_COMPILER_IS_GNUCXX) + set(GNUCXX_MINVER "10") + message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) + message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support") endif() +endif() - if (NOT(ARCH_IA32 AND RELEASE_BUILD)) - set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") - set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") - endif() - - - if (CMAKE_C_COMPILER_ID MATCHES "Intel") - set(SKYLAKE_FLAG "-xCORE-AVX512") +if(RELEASE_BUILD) + if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL) + set(OPT_C_FLAG "-O3") + set(OPT_CXX_FLAG "-O3") else () - set(SKYLAKE_FLAG "-march=skylake-avx512") - set(ICELAKE_FLAG "-march=icelake-server") + set(OPT_C_FLAG "-Os") + set(OPT_CXX_FLAG "-Os") endif () +else() + set(OPT_C_FLAG "-O0") + set(OPT_CXX_FLAG "-O0") +endif(RELEASE_BUILD) + +# set compiler flags - more are tested and added later +set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing") +set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing") +if (NOT CMAKE_COMPILER_IS_CLANG) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching") +endif() + +if (NOT RELEASE_BUILD) + # -Werror is most useful during development, don't potentially break + # release builds + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror") +endif() + +if (DISABLE_ASSERTS) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") +endif() + +if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") +endif() + +if(CMAKE_COMPILER_IS_GNUCXX) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0") + endif () + # don't complain about abi + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") +endif() + +if (NOT(ARCH_IA32 AND RELEASE_BUILD)) + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer") + set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") +endif() + CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) if (ARCH_IA32 OR ARCH_X86_64) @@ -289,8 +307,6 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) message(FATAL_ERROR "arm_sve.h is required to build for SVE.") endif() endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") elseif (ARCH_PPC64EL) CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H) endif() @@ -318,8 +334,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") # This is a Linux-only feature for now - requires platform support # elsewhere message(STATUS "generator is ${CMAKE_GENERATOR}") - if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND - CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") + if (CMAKE_C_COMPILER_IS_CLANG AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9") message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime") set (FAT_RUNTIME_REQUISITES FALSE) elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR @@ -343,7 +358,10 @@ include (${CMAKE_MODULE_PATH}/arch.cmake) # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) -CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) +# Clang does not use __builtin_constant_p() the same way as gcc +if (NOT CMAKE_COMPILER_IS_CLANG) + CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) +endif() set(C_FLAGS_TO_CHECK # Variable length arrays are way bad, most especially at run time @@ -442,18 +460,22 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set(FREEBSD true) endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -if (NOT FAT_RUNTIME) - if (CROSS_COMPILE_AARCH64) +if (FAT_RUNTIME) + if (NOT (ARCH_IA32 OR ARCH_X86_64)) + message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures") + else() + message(STATUS "Building runtime for multiple microarchitectures") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + endif() +else() + if (CROSS_COMPILE) message(STATUS "Building for target CPU: ${ARCH_C_FLAGS}") else() message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}") endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") -else() - message(STATUS "Building runtime for multiple microarchitectures") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() add_subdirectory(util) @@ -1171,8 +1193,8 @@ if (NOT FAT_RUNTIME) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - if (ARCH_IA32) - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3") + if (ARCH_IA32) + set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2") endif (ARCH_IA32) add_library(hs STATIC @@ -1212,7 +1234,7 @@ else (FAT_RUNTIME) add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_LIBS $) set_target_properties(hs_exec_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7 -mssse3" + COMPILE_FLAGS "-march=corei7 -msse4.2" RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" ) @@ -1255,8 +1277,8 @@ else (FAT_RUNTIME) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) if (ARCH_IA32 OR ARCH_X86_64) - set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-mssse3") - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3") + set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-msse4.2") + set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2") endif () # we want the static lib for testing @@ -1281,7 +1303,7 @@ else (FAT_RUNTIME) add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7 -mssse3" + COMPILE_FLAGS "-march=corei7 -msse4.2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" ) From 5d23e6dab67473f34d5814ba2c9967d19ae11dbd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 1 Dec 2021 21:45:31 +0000 Subject: [PATCH 05/17] set -msse4.2 only on Intel --- CMakeLists.txt | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 90395329..d61b4a4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,21 +182,30 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) endif() message(STATUS "gcc will tune for ${GNUCC_ARCH}") elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) - set(GNUCC_ARCH native) - set(TUNE_FLAG generic) message(STATUS "clang will tune for ${TUNE_FLAG}") - if (BUILD_AVX512) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}") - elseif (BUILD_AVX2) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + if (ARCH_IA32 OR ARCH_X86_64) + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + if (BUILD_AVX512) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}") + elseif (BUILD_AVX2) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + endif() + elseif(ARCH_AARCH64) + set(GNUCC_ARCH armv8) + set(TUNE_FLAG generic) + elseif(ARCH_ARM32) + set(GNUCC_ARCH armv7a) + set(TUNE_FLAG generic) else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) endif() - message(STATUS "${CMAKE_C_FLAGS}") - message(STATUS "${CMAKE_CXX_FLAGS}") elseif (CROSS_COMPILE) set(GNUCC_ARCH generic) set(TUNE_FLAG generic) @@ -214,10 +223,9 @@ endif(ARCH_AARCH64) if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") + set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") endif() - - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") endif() endif() From 4aa32275f16282829cc58b9efb1c50dcabd53d14 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 2 Dec 2021 18:00:02 +0200 Subject: [PATCH 06/17] use same definition of the union for all types --- src/util/supervector/supervector.hpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 3ab3b13f..f0ddf63c 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -165,7 +165,7 @@ public: typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size]; typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size]; -#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL) uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; @@ -174,15 +174,6 @@ public: int16x8_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; -#elif defined(ARCH_PPC64EL) - __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; - __vector int64_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; - __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; - __vector int32_t ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; - __vector uint16_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; - __vector int16_t ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; - __vector uint8_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; - __vector int8_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif uint64_t u64[SIZE / sizeof(uint64_t)]; From 5aae719ecdeea8b917176956555e67fc58bc27be Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 2 Dec 2021 18:01:00 +0200 Subject: [PATCH 07/17] fix build with clang, in particular VSX uses long long instead of int64_t, gcc allows this, clang does not --- src/util/arch/ppc64el/simd_types.h | 2 +- src/util/arch/ppc64el/simd_utils.h | 22 ++++++-- src/util/supervector/arch/ppc64el/impl.cpp | 62 +++++++++------------ src/util/supervector/arch/ppc64el/types.hpp | 14 ++++- 4 files changed, 57 insertions(+), 43 deletions(-) diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h index 21dae5cb..8a5b0e25 100644 --- a/src/util/arch/ppc64el/simd_types.h +++ b/src/util/arch/ppc64el/simd_types.h @@ -30,7 +30,7 @@ #define ARCH_PPC64EL_SIMD_TYPES_H #if !defined(m128) && defined(HAVE_VSX) -typedef __vector int32_t m128; +typedef __vector int m128; #endif #endif /* ARCH_PPC64EL_SIMD_TYPES_H */ diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h index 137fc94f..d046ed47 100644 --- a/src/util/arch/ppc64el/simd_utils.h +++ b/src/util/arch/ppc64el/simd_utils.h @@ -43,6 +43,18 @@ #include // for memcpy +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; +/* typedef __vector uint64_t uint64x2_t; typedef __vector int64_t int64x2_t; typedef __vector uint32_t uint32x4_t; @@ -50,7 +62,7 @@ typedef __vector int32_t int32x4_t; typedef __vector uint16_t uint16x8_t; typedef __vector int16_t int16x8_t; typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t; +typedef __vector int8_t int8x16_t;*/ #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 @@ -182,13 +194,13 @@ m128 rshift_m128(m128 a, unsigned b) { static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { - uint64x2_t shift_indices = vec_splats((uint64_t)b); + uint64x2_t shift_indices = vec_splats((ulong64_t)b); return (m128) vec_sl((int64x2_t)a, shift_indices); } static really_really_inline m128 rshift64_m128(m128 a, unsigned b) { - uint64x2_t shift_indices = vec_splats((uint64_t)b); + uint64x2_t shift_indices = vec_splats((ulong64_t)b); return (m128) vec_sr((int64x2_t)a, shift_indices); } @@ -213,11 +225,11 @@ static really_inline u32 movemask128(m128 a) { uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); return s5[0]; diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp index e054e02e..109b8d5e 100644 --- a/src/util/supervector/arch/ppc64el/impl.cpp +++ b/src/util/supervector/arch/ppc64el/impl.cpp @@ -39,16 +39,6 @@ #include "util/supervector/supervector.hpp" #include - -typedef __vector uint64_t uint64x2_t; -typedef __vector int64_t int64x2_t; -typedef __vector uint32_t uint32x4_t; -typedef __vector int32_t int32x4_t; -typedef __vector uint16_t uint16x8_t; -typedef __vector int16_t int16x8_t; -typedef __vector uint8_t uint8x16_t; -typedef __vector int8_t int8x16_t; - // 128-bit Powerpc64le implementation template<> @@ -65,58 +55,58 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.v128[0] = (m128) vec_splats(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { - u.v128[0] = (m128) vec_splats(other); + u.v128[0] = (m128) vec_splats(static_cast(other)); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { - u.v128[0] = (m128) vec_splats(static_cast(other)); + u.v128[0] = (m128) vec_splats(static_cast(other)); } // Constants @@ -229,11 +219,11 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask( uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2); uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28)); - uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff)); + uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff)); uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3); uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9); - uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff)); + uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff)); uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4); return s5[0]; @@ -271,7 +261,7 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const { - return { (m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> @@ -313,7 +303,7 @@ template <> template really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const { - return { (m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)N)) }; + return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; } template <> @@ -352,7 +342,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -362,7 +352,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -372,7 +362,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -382,7 +372,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); return result; } @@ -392,7 +382,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; }); return result; } @@ -408,7 +398,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; }); return result; } @@ -418,7 +408,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; }); return result; } @@ -428,7 +418,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; }); return result; } @@ -438,7 +428,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)n))}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; }); return result; } @@ -448,7 +438,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N) if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), u.s8x16[0], 16 - n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; }); return result; } @@ -523,14 +513,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const long64_t*)ptr); } template <> really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) { assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - return (m128) vec_xl(0, (const int64_t*)ptr); + return (m128) vec_xl(0, (const long64_t*)ptr); } template <> diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp index dbd863f4..bdc6608e 100644 --- a/src/util/supervector/arch/ppc64el/types.hpp +++ b/src/util/supervector/arch/ppc64el/types.hpp @@ -27,6 +27,18 @@ * POSSIBILITY OF SUCH DAMAGE. */ +typedef __vector unsigned long long int uint64x2_t; +typedef __vector signed long long int int64x2_t; +typedef __vector unsigned int uint32x4_t; +typedef __vector signed int int32x4_t; +typedef __vector unsigned short int uint16x8_t; +typedef __vector signed short int int16x8_t; +typedef __vector unsigned char uint8x16_t; +typedef __vector signed char int8x16_t; + +typedef unsigned long long int ulong64_t; +typedef signed long long int long64_t; + #if !defined(m128) && defined(HAVE_VSX) -typedef __vector int32_t m128; +typedef __vector int m128; #endif From 451d539f1d3e89fe885429aeba4a47b1327cd505 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 2 Dec 2021 18:01:26 +0200 Subject: [PATCH 08/17] Power does not use -march --- CMakeLists.txt | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d61b4a4a..10829fb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,6 +154,12 @@ endif () # Detect best GNUCC_ARCH to tune for if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + + if(ARCH_PPC64EL) + set(ARCH_FLAG mcpu) + else() + set(ARCH_FLAG march) + endif() # If gcc doesn't recognise the host cpu, then mtune=native becomes # generic, which isn't very good in some cases. march=native looks at # cpuid info and then chooses the best microarch it can (and replaces @@ -161,12 +167,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) # arg1 might exist if using ccache string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) - set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -mtune=native) execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} OUTPUT_VARIABLE _GCC_OUTPUT) - string(FIND "${_GCC_OUTPUT}" "march" POS) + string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}" POS) string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT) - string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") + string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}") # test the parsed flag set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) From 6b364021d190113fec9d770d3d00e9dfb640cee5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 2 Dec 2021 23:09:34 +0200 Subject: [PATCH 09/17] don't fail if mtune does not return a valid configuration --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10829fb8..9c58fd46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,12 +181,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) INPUT_FILE /dev/null RESULT_VARIABLE GNUCC_TUNE_TEST) if (NOT GNUCC_TUNE_TEST EQUAL 0) - message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid, falling back to -mtune=native") set(TUNE_FLAG native) else() set(TUNE_FLAG ${GNUCC_ARCH}) + message(STATUS "gcc will tune for ${GNUCC_ARCH}") endif() - message(STATUS "gcc will tune for ${GNUCC_ARCH}") elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) message(STATUS "clang will tune for ${TUNE_FLAG}") if (ARCH_IA32 OR ARCH_X86_64) From 7cad5143662c6b83df86d78e385ec7f04e528a2b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 2 Dec 2021 23:09:53 +0200 Subject: [PATCH 10/17] clang is more strict --- unit/internal/simd_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 900078bb..bc2421dc 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) { simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) int64x2_t a = { 0x123456789abcdefLL, ~0LL }; - simd = vreinterpretq_s64_s8(a); + simd = vreinterpretq_s32_s64(a); #elif defined(ARCH_PPC64EL) int64x2_t a = {0x123456789abcdefLL, ~0LL }; simd = (m128) a; From 07ce6d8e7fb7d900da7d488c854f123a08e534b5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 3 Dec 2021 16:24:58 +0200 Subject: [PATCH 11/17] fix build failures with clang on x86, make sure compilation works on other Power as well --- CMakeLists.txt | 98 ++++++++++++++++++++++--------------------- src/util/simd_types.h | 1 + util/CMakeLists.txt | 3 -- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c58fd46..3485e5f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,15 +151,16 @@ else () set(ICELAKE_FLAG "-march=icelake-server") endif () +if(ARCH_PPC64EL) + set(ARCH_FLAG mcpu) +else() + set(ARCH_FLAG march) +endif() + # Detect best GNUCC_ARCH to tune for if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") - if(ARCH_PPC64EL) - set(ARCH_FLAG mcpu) - else() - set(ARCH_FLAG march) - endif() # If gcc doesn't recognise the host cpu, then mtune=native becomes # generic, which isn't very good in some cases. march=native looks at # cpuid info and then chooses the best microarch it can (and replaces @@ -185,23 +186,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE) set(TUNE_FLAG native) else() set(TUNE_FLAG ${GNUCC_ARCH}) - message(STATUS "gcc will tune for ${GNUCC_ARCH}") + message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") endif() elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) - message(STATUS "clang will tune for ${TUNE_FLAG}") if (ARCH_IA32 OR ARCH_X86_64) set(GNUCC_ARCH native) set(TUNE_FLAG generic) - if (BUILD_AVX512) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}") - elseif (BUILD_AVX2) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") - else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - endif() elseif(ARCH_AARCH64) set(GNUCC_ARCH armv8) set(TUNE_FLAG generic) @@ -212,11 +202,30 @@ elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE) set(GNUCC_ARCH native) set(TUNE_FLAG generic) endif() + message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") elseif (CROSS_COMPILE) set(GNUCC_ARCH generic) set(TUNE_FLAG generic) endif() +if (ARCH_IA32 OR ARCH_X86_64) + if (NOT FAT_RUNTIME) + if (BUILD_AVX512) + set(ARCH_C_FLAGS "${SKYLAKE_FLAG}") + set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}") + elseif (BUILD_AVX2) + set(ARCH_C_FLAGS "-mavx2") + set(ARCH_CXX_FLAGS "-mavx2") + else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") + endif() + else() + set(ARCH_C_FLAGS "-msse4.2") + set(ARCH_CXX_FLAGS "-msse4.2") + endif() +endif() + if (ARCH_AARCH64) if (BUILD_SVE2_BITPERM) set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm") @@ -227,23 +236,26 @@ if (ARCH_AARCH64) endif () endif(ARCH_AARCH64) -if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") - endif() -endif() +set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}") +set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}") + +#if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64) +# if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) +# set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") +# endif() +# if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) +# set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}") +# endif() +#endif() -if(ARCH_PPC64EL) - if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) - set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") - endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) - set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") - endif() -endif() +#if(ARCH_PPC64EL) +# if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) +# set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}") +# endif() +# if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) +# set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}") +# endif() +#endif() # compiler version checks TODO: test more compilers if (CMAKE_COMPILER_IS_GNUCXX) @@ -306,7 +318,6 @@ if (NOT(ARCH_IA32 AND RELEASE_BUILD)) set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer") endif() - CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) if (ARCH_IA32 OR ARCH_X86_64) CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) @@ -474,13 +485,12 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set(FREEBSD true) endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + if (FAT_RUNTIME) if (NOT (ARCH_IA32 OR ARCH_X86_64)) message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures") else() message(STATUS "Building runtime for multiple microarchitectures") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() else() if (CROSS_COMPILE) @@ -488,9 +498,9 @@ else() else() message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}") endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") endif() +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") add_subdirectory(util) add_subdirectory(doc/dev-reference) @@ -1207,10 +1217,6 @@ if (NOT FAT_RUNTIME) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - if (ARCH_IA32) - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2") - endif (ARCH_IA32) - add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c @@ -1241,7 +1247,7 @@ else (FAT_RUNTIME) add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_LIBS $) set_target_properties(hs_exec_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" + COMPILE_FLAGS "-march=core2 -msse4.2" RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) @@ -1290,10 +1296,6 @@ else (FAT_RUNTIME) ${RUNTIME_LIBS}) set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) add_library(hs_compile OBJECT ${hs_compile_SRCS}) - if (ARCH_IA32 OR ARCH_X86_64) - set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-msse4.2") - set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2") - endif () # we want the static lib for testing add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c @@ -1310,7 +1312,7 @@ else (FAT_RUNTIME) add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" + COMPILE_FLAGS "-march=core2 -msse4.2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 0deff7e5..4f0fd1a9 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -51,6 +51,7 @@ typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif typedef struct {m128 lo; m128 mid; m128 hi;} m384; + #if !defined(m512) && !defined(HAVE_SIMD_512_BITS) typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index 82cee0ff..ea942ef1 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -33,9 +33,6 @@ SET(corpusomatic_SRCS ng_find_matches.cpp ) add_library(corpusomatic STATIC ${corpusomatic_SRCS}) -if (ARCH_IA32 OR ARCH_X86_64) - set_target_properties(corpusomatic PROPERTIES COMPILE_FLAGS "-mssse3") -endif () set(databaseutil_SRCS database_util.cpp From 290eabbca08e7e591ea53cfe3bf37bce5bc7f9fb Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 6 Dec 2021 18:22:58 +0000 Subject: [PATCH 12/17] fix compilation with clang and some incomplete/wrong implementations for arm this time --- src/util/arch/arm/simd_utils.h | 238 ++++++++++++++++++++++++- src/util/supervector/arch/arm/impl.cpp | 62 +++---- 2 files changed, 264 insertions(+), 36 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 4c68b485..96cd332c 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -122,24 +122,252 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b); } -static really_really_inline +static really_inline m128 lshift_m128(m128 a, unsigned b) { - return (m128) vshlq_n_u32((uint32x4_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u32((uint32x4_t)a, b); + } +#endif +#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((int8x16_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_LSHIFT_m128(a, 1); + CASE_LSHIFT_m128(a, 2); + CASE_LSHIFT_m128(a, 3); + CASE_LSHIFT_m128(a, 4); + CASE_LSHIFT_m128(a, 5); + CASE_LSHIFT_m128(a, 6); + CASE_LSHIFT_m128(a, 7); + CASE_LSHIFT_m128(a, 8); + CASE_LSHIFT_m128(a, 9); + CASE_LSHIFT_m128(a, 10); + CASE_LSHIFT_m128(a, 11); + CASE_LSHIFT_m128(a, 12); + CASE_LSHIFT_m128(a, 13); + CASE_LSHIFT_m128(a, 14); + CASE_LSHIFT_m128(a, 15); + CASE_LSHIFT_m128(a, 16); + CASE_LSHIFT_m128(a, 17); + CASE_LSHIFT_m128(a, 18); + CASE_LSHIFT_m128(a, 19); + CASE_LSHIFT_m128(a, 20); + CASE_LSHIFT_m128(a, 21); + CASE_LSHIFT_m128(a, 22); + CASE_LSHIFT_m128(a, 23); + CASE_LSHIFT_m128(a, 24); + CASE_LSHIFT_m128(a, 25); + CASE_LSHIFT_m128(a, 26); + CASE_LSHIFT_m128(a, 27); + CASE_LSHIFT_m128(a, 28); + CASE_LSHIFT_m128(a, 29); + CASE_LSHIFT_m128(a, 30); + CASE_LSHIFT_m128(a, 31); + default: return zeroes128(); break; + } +#undef CASE_LSHIFT_m128 } static really_really_inline m128 rshift_m128(m128 a, unsigned b) { - return (m128) vshrq_n_u32((uint32x4_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u32((uint32x4_t)a, b); + } +#endif +#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((int8x16_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_RSHIFT_m128(a, 1); + CASE_RSHIFT_m128(a, 2); + CASE_RSHIFT_m128(a, 3); + CASE_RSHIFT_m128(a, 4); + CASE_RSHIFT_m128(a, 5); + CASE_RSHIFT_m128(a, 6); + CASE_RSHIFT_m128(a, 7); + CASE_RSHIFT_m128(a, 8); + CASE_RSHIFT_m128(a, 9); + CASE_RSHIFT_m128(a, 10); + CASE_RSHIFT_m128(a, 11); + CASE_RSHIFT_m128(a, 12); + CASE_RSHIFT_m128(a, 13); + CASE_RSHIFT_m128(a, 14); + CASE_RSHIFT_m128(a, 15); + CASE_RSHIFT_m128(a, 16); + CASE_RSHIFT_m128(a, 17); + CASE_RSHIFT_m128(a, 18); + CASE_RSHIFT_m128(a, 19); + CASE_RSHIFT_m128(a, 20); + CASE_RSHIFT_m128(a, 21); + CASE_RSHIFT_m128(a, 22); + CASE_RSHIFT_m128(a, 23); + CASE_RSHIFT_m128(a, 24); + CASE_RSHIFT_m128(a, 25); + CASE_RSHIFT_m128(a, 26); + CASE_RSHIFT_m128(a, 27); + CASE_RSHIFT_m128(a, 28); + CASE_RSHIFT_m128(a, 29); + CASE_RSHIFT_m128(a, 30); + CASE_RSHIFT_m128(a, 31); + default: return zeroes128(); break; + } +#undef CASE_RSHIFT_m128 } static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { - return (m128) vshlq_n_u64((uint64x2_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshlq_n_u64((uint64x2_t)a, b); + } +#endif +#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((int8x16_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_LSHIFT64_m128(a, 1); + CASE_LSHIFT64_m128(a, 2); + CASE_LSHIFT64_m128(a, 3); + CASE_LSHIFT64_m128(a, 4); + CASE_LSHIFT64_m128(a, 5); + CASE_LSHIFT64_m128(a, 6); + CASE_LSHIFT64_m128(a, 7); + CASE_LSHIFT64_m128(a, 8); + CASE_LSHIFT64_m128(a, 9); + CASE_LSHIFT64_m128(a, 10); + CASE_LSHIFT64_m128(a, 11); + CASE_LSHIFT64_m128(a, 12); + CASE_LSHIFT64_m128(a, 13); + CASE_LSHIFT64_m128(a, 14); + CASE_LSHIFT64_m128(a, 15); + CASE_LSHIFT64_m128(a, 16); + CASE_LSHIFT64_m128(a, 17); + CASE_LSHIFT64_m128(a, 18); + CASE_LSHIFT64_m128(a, 19); + CASE_LSHIFT64_m128(a, 20); + CASE_LSHIFT64_m128(a, 21); + CASE_LSHIFT64_m128(a, 22); + CASE_LSHIFT64_m128(a, 23); + CASE_LSHIFT64_m128(a, 24); + CASE_LSHIFT64_m128(a, 25); + CASE_LSHIFT64_m128(a, 26); + CASE_LSHIFT64_m128(a, 27); + CASE_LSHIFT64_m128(a, 28); + CASE_LSHIFT64_m128(a, 29); + CASE_LSHIFT64_m128(a, 30); + CASE_LSHIFT64_m128(a, 31); + CASE_LSHIFT64_m128(a, 32); + CASE_LSHIFT64_m128(a, 33); + CASE_LSHIFT64_m128(a, 34); + CASE_LSHIFT64_m128(a, 35); + CASE_LSHIFT64_m128(a, 36); + CASE_LSHIFT64_m128(a, 37); + CASE_LSHIFT64_m128(a, 38); + CASE_LSHIFT64_m128(a, 39); + CASE_LSHIFT64_m128(a, 40); + CASE_LSHIFT64_m128(a, 41); + CASE_LSHIFT64_m128(a, 42); + CASE_LSHIFT64_m128(a, 43); + CASE_LSHIFT64_m128(a, 44); + CASE_LSHIFT64_m128(a, 45); + CASE_LSHIFT64_m128(a, 46); + CASE_LSHIFT64_m128(a, 47); + CASE_LSHIFT64_m128(a, 48); + CASE_LSHIFT64_m128(a, 49); + CASE_LSHIFT64_m128(a, 50); + CASE_LSHIFT64_m128(a, 51); + CASE_LSHIFT64_m128(a, 52); + CASE_LSHIFT64_m128(a, 53); + CASE_LSHIFT64_m128(a, 54); + CASE_LSHIFT64_m128(a, 55); + CASE_LSHIFT64_m128(a, 56); + CASE_LSHIFT64_m128(a, 57); + CASE_LSHIFT64_m128(a, 58); + CASE_LSHIFT64_m128(a, 59); + CASE_LSHIFT64_m128(a, 60); + CASE_LSHIFT64_m128(a, 61); + CASE_LSHIFT64_m128(a, 62); + CASE_LSHIFT64_m128(a, 63); + default: return zeroes128(); break; + } +#undef CASE_LSHIFT64_m128 } static really_really_inline m128 rshift64_m128(m128 a, unsigned b) { - return (m128) vshrq_n_u64((uint64x2_t)a, b); +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return (m128) vshrq_n_u64((uint64x2_t)a, b); + } +#endif +#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((int8x16_t)(a), (offset)); break; + switch (b) { + case 0: return a; break; + CASE_RSHIFT64_m128(a, 1); + CASE_RSHIFT64_m128(a, 2); + CASE_RSHIFT64_m128(a, 3); + CASE_RSHIFT64_m128(a, 4); + CASE_RSHIFT64_m128(a, 5); + CASE_RSHIFT64_m128(a, 6); + CASE_RSHIFT64_m128(a, 7); + CASE_RSHIFT64_m128(a, 8); + CASE_RSHIFT64_m128(a, 9); + CASE_RSHIFT64_m128(a, 10); + CASE_RSHIFT64_m128(a, 11); + CASE_RSHIFT64_m128(a, 12); + CASE_RSHIFT64_m128(a, 13); + CASE_RSHIFT64_m128(a, 14); + CASE_RSHIFT64_m128(a, 15); + CASE_RSHIFT64_m128(a, 16); + CASE_RSHIFT64_m128(a, 17); + CASE_RSHIFT64_m128(a, 18); + CASE_RSHIFT64_m128(a, 19); + CASE_RSHIFT64_m128(a, 20); + CASE_RSHIFT64_m128(a, 21); + CASE_RSHIFT64_m128(a, 22); + CASE_RSHIFT64_m128(a, 23); + CASE_RSHIFT64_m128(a, 24); + CASE_RSHIFT64_m128(a, 25); + CASE_RSHIFT64_m128(a, 26); + CASE_RSHIFT64_m128(a, 27); + CASE_RSHIFT64_m128(a, 28); + CASE_RSHIFT64_m128(a, 29); + CASE_RSHIFT64_m128(a, 30); + CASE_RSHIFT64_m128(a, 31); + CASE_RSHIFT64_m128(a, 32); + CASE_RSHIFT64_m128(a, 33); + CASE_RSHIFT64_m128(a, 34); + CASE_RSHIFT64_m128(a, 35); + CASE_RSHIFT64_m128(a, 36); + CASE_RSHIFT64_m128(a, 37); + CASE_RSHIFT64_m128(a, 38); + CASE_RSHIFT64_m128(a, 39); + CASE_RSHIFT64_m128(a, 40); + CASE_RSHIFT64_m128(a, 41); + CASE_RSHIFT64_m128(a, 42); + CASE_RSHIFT64_m128(a, 43); + CASE_RSHIFT64_m128(a, 44); + CASE_RSHIFT64_m128(a, 45); + CASE_RSHIFT64_m128(a, 46); + CASE_RSHIFT64_m128(a, 47); + CASE_RSHIFT64_m128(a, 48); + CASE_RSHIFT64_m128(a, 49); + CASE_RSHIFT64_m128(a, 50); + CASE_RSHIFT64_m128(a, 51); + CASE_RSHIFT64_m128(a, 52); + CASE_RSHIFT64_m128(a, 53); + CASE_RSHIFT64_m128(a, 54); + CASE_RSHIFT64_m128(a, 55); + CASE_RSHIFT64_m128(a, 56); + CASE_RSHIFT64_m128(a, 57); + CASE_RSHIFT64_m128(a, 58); + CASE_RSHIFT64_m128(a, 59); + CASE_RSHIFT64_m128(a, 60); + CASE_RSHIFT64_m128(a, 61); + CASE_RSHIFT64_m128(a, 62); + CASE_RSHIFT64_m128(a, 63); + default: return zeroes128(); break; + } +#undef CASE_RSHIFT64_m128 } static really_inline m128 eq128(m128 a, m128 b) { diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp index 980f0b39..ff1149a9 100644 --- a/src/util/supervector/arch/arm/impl.cpp +++ b/src/util/supervector/arch/arm/impl.cpp @@ -45,112 +45,112 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v) template<> template<> -really_inline SuperVector<16>::SuperVector(int8x16_t other) +really_inline SuperVector<16>::SuperVector(int8x16_t other) { u.s8x16[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8x16_t other) +really_inline SuperVector<16>::SuperVector(uint8x16_t other) { u.u8x16[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int16x8_t other) +really_inline SuperVector<16>::SuperVector(int16x8_t other) { u.s16x8[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16x8_t other) +really_inline SuperVector<16>::SuperVector(uint16x8_t other) { u.u16x8[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int32x4_t other) +really_inline SuperVector<16>::SuperVector(int32x4_t other) { u.s32x4[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32x4_t other) +really_inline SuperVector<16>::SuperVector(uint32x4_t other) { u.u32x4[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int64x2_t other) +really_inline SuperVector<16>::SuperVector(int64x2_t other) { u.s64x2[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64x2_t other) +really_inline SuperVector<16>::SuperVector(uint64x2_t other) { u.u64x2[0] = other; } template<> template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) +really_inline SuperVector<16>::SuperVector(int8_t const other) { u.s8x16[0] = vdupq_n_s8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) +really_inline SuperVector<16>::SuperVector(uint8_t const other) { u.u8x16[0] = vdupq_n_u8(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) +really_inline SuperVector<16>::SuperVector(int16_t const other) { u.s16x8[0] = vdupq_n_s16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) +really_inline SuperVector<16>::SuperVector(uint16_t const other) { u.u16x8[0] = vdupq_n_u16(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) +really_inline SuperVector<16>::SuperVector(int32_t const other) { u.s32x4[0] = vdupq_n_s32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) +really_inline SuperVector<16>::SuperVector(uint32_t const other) { u.u32x4[0] = vdupq_n_u32(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) +really_inline SuperVector<16>::SuperVector(int64_t const other) { u.s64x2[0] = vdupq_n_s64(other); } template<> template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) +really_inline SuperVector<16>::SuperVector(uint64_t const other) { u.u64x2[0] = vdupq_n_u64(other); } @@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; }); + Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; }); return result; } @@ -386,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; }); return result; } @@ -394,9 +394,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 32) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; }); + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; }); return result; } @@ -404,9 +404,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 64) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; }); + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; }); return result; } @@ -416,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; }); return result; } @@ -430,9 +430,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 8) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; }); + Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; }); return result; } @@ -442,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; }); return result; } @@ -450,9 +450,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 32) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; }); + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; }); return result; } @@ -460,9 +460,9 @@ template <> really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const { if (N == 0) return *this; - if (N == 16) return Zeroes(); + if (N == 64) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; }); + Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; }); return result; } @@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const if (N == 0) return *this; if (N == 16) return Zeroes(); SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; }); + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; }); return result; } From d3f0d8dd704a5500be641b693dcf1e361ec59f47 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 6 Dec 2021 18:38:01 +0000 Subject: [PATCH 13/17] update Jenkinsfile for all configurations --- Jenkinsfile | 606 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 587 insertions(+), 19 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1883f43a..3dbef5b6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,22 +1,590 @@ pipeline { - agent { - node { - label 'x86' - } - - } - stages { - stage('Release, SSE') { - agent { - node { - label 'x86' + agent none + stages { + stage("Build") { + failFast true + parallel { + stage("Release/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Release/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Release/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Release/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-release-fat/bin/unit-hyperscan' + } + } + } + } + stage("Debug/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Debug/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Debug/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Debug/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-debug-fat/bin/unit-hyperscan' + } + } + } + } + stage("Release/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-arm/bin/unit-hyperscan' + } + } + } + } + stage("Debug/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-arm/bin/unit-hyperscan' + } + } + } + } + stage("Release/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-release-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-release-power/bin/unit-hyperscan' + } + } + } + } + stage("Debug/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-debug-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-debug-power/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-clang-release-fat/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/SSE") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-SSE/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-SSE/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/AVX2") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-AVX2/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-AVX2/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/AVX512") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-AVX512/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-AVX512/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/FAT") { + agent { label "x86" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Test") { + steps { + sh 'build-clang-debug-fat/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-arm/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/ARM") { + agent { label "arm" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-arm/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-arm/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Release/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-release-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-release-power/bin/unit-hyperscan' + } + } + } + } + stage("Clang-Debug/Power") { + agent { label "power" } + stages { + stage("Git checkout") { + steps { + checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]]) + } + } + stage("Build") { + steps { + cmakeBuild buildDir: 'build-clang-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]] + } + } + stage("Unit Test") { + steps { + sh 'build-clang-debug-power/bin/unit-internal' + } + } + stage("Test") { + steps { + sh 'build-clang-debug-power/bin/unit-hyperscan' + } + } + } + } + } } - - } - steps { - sh 'mkdir build-release-SSE && cmake -DCMAKE_BUILD_TYPE=Release -C build-release-SSE' - } } - - } -} \ No newline at end of file +} From deeb113977af4ef2fb72c6c7551cf56d19be3291 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 6 Dec 2021 21:35:37 +0000 Subject: [PATCH 14/17] lower gcc minver to 9 to enable building on Ubuntu 20 LTS --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3485e5f8..76bca813 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,7 @@ set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_F # compiler version checks TODO: test more compilers if (CMAKE_COMPILER_IS_GNUCXX) - set(GNUCXX_MINVER "10") + set(GNUCXX_MINVER "9") message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support") From fec557c1f9ca7d9eae4ca6a3e419a50bef674a06 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 6 Dec 2021 21:35:51 +0000 Subject: [PATCH 15/17] fix wrong castings for NEON --- src/util/arch/arm/simd_utils.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 96cd332c..d1ab583f 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -129,7 +129,7 @@ m128 lshift_m128(m128 a, unsigned b) { return (m128) vshlq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((int8x16_t)(a), (offset)); break; +#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break; switch (b) { case 0: return a; break; CASE_LSHIFT_m128(a, 1); @@ -175,7 +175,7 @@ m128 rshift_m128(m128 a, unsigned b) { return (m128) vshrq_n_u32((uint32x4_t)a, b); } #endif -#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((int8x16_t)(a), (offset)); break; +#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break; switch (b) { case 0: return a; break; CASE_RSHIFT_m128(a, 1); @@ -221,7 +221,7 @@ m128 lshift64_m128(m128 a, unsigned b) { return (m128) vshlq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((int8x16_t)(a), (offset)); break; +#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break; switch (b) { case 0: return a; break; CASE_LSHIFT64_m128(a, 1); @@ -299,7 +299,7 @@ m128 rshift64_m128(m128 a, unsigned b) { return (m128) vshrq_n_u64((uint64x2_t)a, b); } #endif -#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((int8x16_t)(a), (offset)); break; +#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break; switch (b) { case 0: return a; break; CASE_RSHIFT64_m128(a, 1); From fd2eabd0716477e29008da6772c499b855f6d48c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 7 Dec 2021 08:43:52 +0000 Subject: [PATCH 16/17] fix clang-release-arm compilation --- src/util/arch/arm/simd_utils.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index d1ab583f..764d26fd 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -419,9 +419,10 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { -#if defined(HS_OPTIMIZE) - return vgetq_lane_u32((uint32x4_t) in, imm); -#else +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return vgetq_lane_u32((uint32x4_t) in, imm); +#endif switch (imm) { case 0: return vgetq_lane_u32((uint32x4_t) in, 0); @@ -439,13 +440,13 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { return 0; break; } -#endif } static really_inline u64a extract64from128(const m128 in, unsigned imm) { -#if defined(HS_OPTIMIZE) - return vgetq_lane_u64((uint64x2_t) in, imm); -#else +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return vgetq_lane_u64((uint64x2_t) in, imm); +#endif switch (imm) { case 0: return vgetq_lane_u64((uint64x2_t) in, 0); @@ -457,7 +458,6 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) { return 0; break; } -#endif } static really_inline m128 low64from128(const m128 in) { From 4589f1742e1ef24ea8e87a56a477e76a56358968 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 7 Dec 2021 08:49:59 +0000 Subject: [PATCH 17/17] minor fixes --- src/util/arch/arm/simd_utils.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 764d26fd..902d3624 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -420,8 +420,9 @@ m128 load_m128_from_u64a(const u64a *p) { static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { + if (__builtin_constant_p(imm)) { return vgetq_lane_u32((uint32x4_t) in, imm); + } #endif switch (imm) { case 0: @@ -444,8 +445,9 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { static really_inline u64a extract64from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { + if (__builtin_constant_p(imm)) { return vgetq_lane_u64((uint64x2_t) in, imm); + } #endif switch (imm) { case 0: