From 404a0ab0f4ea80a012b01dcce2d4a7bc12d4c821 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 1 Dec 2021 23:18:57 +0200
Subject: [PATCH 01/17] fix miscompilation with clang

---
 cmake/platform.cmake | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmake/platform.cmake b/cmake/platform.cmake
index 2cdc3a6e..5a2b85b2 100644
--- a/cmake/platform.cmake
+++ b/cmake/platform.cmake
@@ -1,3 +1,8 @@
+# determine compiler
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(CMAKE_COMPILER_IS_CLANG TRUE)
+endif()
+
 # determine the target arch
 
 if (CROSS_COMPILE_AARCH64)
@@ -10,7 +15,7 @@ else()
   CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
-  CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
+  CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
   if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
     set(ARCH_64_BIT TRUE)
   else()

From 7d600c4fcbb0c85f3082f164d969c245fc0a71d5 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 1 Dec 2021 23:19:43 +0200
Subject: [PATCH 02/17] bump base requirements to SSE4.2

---
 cmake/arch.cmake               | 14 +++++++-------
 src/util/arch/x86/simd_types.h |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 2100799f..29c39b49 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -88,7 +88,7 @@ if (FAT_RUNTIME)
             set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
         endif (BUILD_AVX512VBMI)
     elseif (BUILD_AVX2)
-        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx")
+        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2")
     elseif ()
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3")
     endif ()
@@ -98,12 +98,12 @@ else (NOT FAT_RUNTIME)
 endif ()
 
 if (ARCH_IA32 OR ARCH_X86_64)
-    # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
+    # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 int main() {
     __m128i a = _mm_set1_epi8(1);
     (void)_mm_shuffle_epi8(a, a);
-}" HAVE_SSSE3)
+}" HAVE_SSE42)
 
     # now look for AVX2
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@@ -157,8 +157,8 @@ else ()
 endif ()
 
 if (FAT_RUNTIME)
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
     endif ()
     if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2)
         message(FATAL_ERROR "AVX2 support required to build fat runtime")
@@ -179,8 +179,8 @@ else (NOT FAT_RUNTIME)
     if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI)
         message(STATUS "Building without AVX512VBMI support")
     endif ()
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
     endif ()
     if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
         message(FATAL_ERROR "NEON support required for ARM support")
diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h
index c04e8dab..e1642404 100644
--- a/src/util/arch/x86/simd_types.h
+++ b/src/util/arch/x86/simd_types.h
@@ -30,7 +30,7 @@
 #ifndef SIMD_TYPES_X86_H
 #define SIMD_TYPES_X86_H
 
-#if !defined(m128) && defined(HAVE_SSE2)
+#if !defined(m128) && defined(HAVE_SSE42)
 typedef __m128i m128;
 #endif
 

From 0221dc1771716b50ec601cc21e9e769e184b9be2 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 1 Dec 2021 23:22:15 +0200
Subject: [PATCH 03/17] fix misompilations with clang++, as it is more strict

---
 src/util/supervector/arch/x86/impl.cpp | 54 +++++++++++++-------------
 src/util/supervector/supervector.hpp   |  6 +--
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index b7686220..157f1dc4 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -55,56 +55,56 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.v128[0] = _mm_set1_epi8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.v128[0] = _mm_set1_epi16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.v128[0] = _mm_set1_epi32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
     u.v128[0] = _mm_set1_epi64x(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
     u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
 }
@@ -608,56 +608,56 @@ really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<32>::SuperVector(int8_t const other)
 {
     u.v256[0] = _mm256_set1_epi8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<32>::SuperVector(uint8_t const other)
 {
     u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<32>::SuperVector(int16_t const other)
 {
     u.v256[0] = _mm256_set1_epi16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<32>::SuperVector(uint16_t const other)
 {
     u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<32>::SuperVector(int32_t const other)
 {
     u.v256[0] = _mm256_set1_epi32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<32>::SuperVector(uint32_t const other)
 {
     u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<32>::SuperVector(int64_t const other)
 {
     u.v256[0] = _mm256_set1_epi64x(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<32>::SuperVector(uint64_t const other)
 {
     u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other));
 }
@@ -804,7 +804,7 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const
 
 template <>
 template<uint8_t N>
-really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const
+really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const
 {
     if (N == 0) return *this;
     if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
@@ -950,11 +950,11 @@ really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
+        if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
     });
     Unroller<17, 32>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
+        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
     });
     return result;
 }
@@ -1240,56 +1240,56 @@ really_inline SuperVector<64>::SuperVector(m128 const v)
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int8_t>(int8_t const o)
+really_inline SuperVector<64>::SuperVector(int8_t const o)
 {
     u.v512[0] = _mm512_set1_epi8(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint8_t>(uint8_t const o)
+really_inline SuperVector<64>::SuperVector(uint8_t const o)
 {
     u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int16_t>(int16_t const o)
+really_inline SuperVector<64>::SuperVector(int16_t const o)
 {
     u.v512[0] = _mm512_set1_epi16(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint16_t>(uint16_t const o)
+really_inline SuperVector<64>::SuperVector(uint16_t const o)
 {
     u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int32_t>(int32_t const o)
+really_inline SuperVector<64>::SuperVector(int32_t const o)
 {
     u.v512[0] = _mm512_set1_epi32(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint32_t>(uint32_t const o)
+really_inline SuperVector<64>::SuperVector(uint32_t const o)
 {
     u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int64_t>(int64_t const o)
+really_inline SuperVector<64>::SuperVector(int64_t const o)
 {
     u.v512[0] = _mm512_set1_epi64(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint64_t>(uint64_t const o)
+really_inline SuperVector<64>::SuperVector(uint64_t const o)
 {
     u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o));
 }
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 737412f6..3ab3b13f 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -174,9 +174,7 @@ public:
     int16x8_t  ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
     uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
     int8x16_t  ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
-#endif
-
-#if defined(ARCH_PPC64EL)
+#elif defined(ARCH_PPC64EL)
     __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
     __vector int64_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
     __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@@ -200,7 +198,7 @@ public:
   } u;
 
   constexpr SuperVector() {};
-  constexpr SuperVector(SuperVector const &other)
+  SuperVector(SuperVector const &other)
   :u(other.u) {};
   SuperVector(typename base_type::type const v);
 

From 1f4143de81fab6619a44aa6ae175e1cec2e51992 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 1 Dec 2021 23:23:37 +0200
Subject: [PATCH 04/17] rework CMakeLists.txt to ensure it works with clang

---
 CMakeLists.txt | 284 ++++++++++++++++++++++++++-----------------------
 1 file changed, 153 insertions(+), 131 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a741961c..90395329 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ project (vectorscan C CXX)
 
 set (HS_MAJOR_VERSION 5)
 set (HS_MINOR_VERSION 4)
-set (HS_PATCH_VERSION 3)
+set (HS_PATCH_VERSION 5)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -128,11 +128,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r
 
 CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON)
 
-option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime"
-    OFF)
+option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF)
 
-option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime"
-    OFF)
+option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" OFF)
 
 if (BUILD_AVX512VBMI)
     set(BUILD_AVX512 ON)
@@ -140,47 +138,71 @@ endif ()
 
 # TODO: per platform config files?
 
-    # remove CMake's idea of optimisation
-    foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})
-        string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")
-        string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
-    endforeach ()
+# remove CMake's idea of optimisation
+foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})
+    string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")
+    string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
+endforeach ()
 
-    if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64 AND NOT ARCH_PPC64EL)
-        message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
-        # If gcc doesn't recognise the host cpu, then mtune=native becomes
-        # generic, which isn't very good in some cases. march=native looks at
-        # cpuid info and then chooses the best microarch it can (and replaces
-        # the flag), so use that for tune.
+if (CMAKE_C_COMPILER_ID MATCHES "Intel")
+    set(SKYLAKE_FLAG "-xCORE-AVX512")
+else ()
+    set(SKYLAKE_FLAG "-march=skylake-avx512")
+    set(ICELAKE_FLAG "-march=icelake-server")
+endif ()
 
-        # arg1 might exist if using ccache
-        string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
-        set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
-        execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-            OUTPUT_VARIABLE _GCC_OUTPUT)
-        string(FIND "${_GCC_OUTPUT}" "march" POS)
-        string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
-        string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1"
-            GNUCC_ARCH "${_GCC_OUTPUT}")
+# Detect best GNUCC_ARCH to tune for
+if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
+    message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
+    # If gcc doesn't recognise the host cpu, then mtune=native becomes
+    # generic, which isn't very good in some cases. march=native looks at
+    # cpuid info and then chooses the best microarch it can (and replaces
+    # the flag), so use that for tune.
 
-        if (ARCH_IA32 OR ARCH_X86_64)
-            # test the parsed flag
-            set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
-            execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-                OUTPUT_QUIET ERROR_QUIET
-                INPUT_FILE /dev/null
-                RESULT_VARIABLE GNUCC_TUNE_TEST)
-            if (NOT GNUCC_TUNE_TEST EQUAL 0)
-                message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
-            endif()
-            set(TUNE_FLAG ${GNUCC_ARCH})
-        else()
-            set(TUNE_FLAG native)
-        endif()
-    elseif (NOT TUNE_FLAG)
+    # arg1 might exist if using ccache
+    string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
+    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_VARIABLE _GCC_OUTPUT)
+    string(FIND "${_GCC_OUTPUT}" "march" POS)
+    string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
+    string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
+
+    # test the parsed flag
+    set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_QUIET ERROR_QUIET
+        INPUT_FILE /dev/null
+        RESULT_VARIABLE GNUCC_TUNE_TEST)
+    if (NOT GNUCC_TUNE_TEST EQUAL 0)
+        message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
         set(TUNE_FLAG native)
+    else()
+        set(TUNE_FLAG ${GNUCC_ARCH})
     endif()
+    message(STATUS "gcc will tune for ${GNUCC_ARCH}")
+elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
+    set(GNUCC_ARCH native)
+    set(TUNE_FLAG generic)
+    message(STATUS "clang will tune for ${TUNE_FLAG}")
+    if (BUILD_AVX512)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}")
+    elseif (BUILD_AVX2)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+    else()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+    endif()
+    message(STATUS "${CMAKE_C_FLAGS}")
+    message(STATUS "${CMAKE_CXX_FLAGS}")
+elseif (CROSS_COMPILE)
+    set(GNUCC_ARCH generic)
+    set(TUNE_FLAG generic)
+endif()
 
+if (ARCH_AARCH64)
     if (BUILD_SVE2_BITPERM)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm")
     elseif (BUILD_SVE2)
@@ -188,92 +210,88 @@ endif ()
     elseif (BUILD_SVE)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve")
     endif ()
+endif(ARCH_AARCH64)
 
-    # compiler version checks TODO: test more compilers
-    if (CMAKE_COMPILER_IS_GNUCXX)
-        set(GNUCXX_MINVER "4.8.1")
-        message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
-            message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support")
-        endif()
-    endif()
-
-    if(RELEASE_BUILD)
-        if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL)
-            set(OPT_C_FLAG "-O3")
-            set(OPT_CXX_FLAG "-O3")
-        else ()
-            set(OPT_C_FLAG "-Os")
-            set(OPT_CXX_FLAG "-Os")
-        endif ()
-    else()
-        set(OPT_C_FLAG "-O0")
-        set(OPT_CXX_FLAG "-O0")
-    endif(RELEASE_BUILD)
-
-    # set compiler flags - more are tested and added later
-    set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
-    set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing -fno-new-ttp-matching")
-
-    if (NOT RELEASE_BUILD)
-        # -Werror is most useful during development, don't potentially break
-        # release builds
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
-    endif()
-
-    if (DISABLE_ASSERTS)
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
-    endif()
-
-    
-    if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
-	 if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
+    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
             set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-   	 endif()
+    endif()
 	 
-	 if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-            set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-         endif()
+	  if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+        set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
     endif()
+endif()
     
-    if(ARCH_PPC64EL)
-        if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-            set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
-        endif()
-        if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-            set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
-        endif()
+if(ARCH_PPC64EL)
+    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+        set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
     endif()
-
-    if(CMAKE_COMPILER_IS_GNUCC)
-        # spurious warnings?
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+        set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
     endif()
+endif()
 
-    if(CMAKE_COMPILER_IS_GNUCXX)
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-            set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
-        endif ()
-        # don't complain about abi
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
+# compiler version checks TODO: test more compilers
+if (CMAKE_COMPILER_IS_GNUCXX)
+    set(GNUCXX_MINVER "10")
+    message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
+        message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")
     endif()
+endif()
 
-    if (NOT(ARCH_IA32 AND RELEASE_BUILD))
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
-    endif()
-
-
-    if (CMAKE_C_COMPILER_ID MATCHES "Intel")
-        set(SKYLAKE_FLAG "-xCORE-AVX512")
+if(RELEASE_BUILD)
+    if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL)
+        set(OPT_C_FLAG "-O3")
+        set(OPT_CXX_FLAG "-O3")
     else ()
-        set(SKYLAKE_FLAG "-march=skylake-avx512")
-        set(ICELAKE_FLAG "-march=icelake-server")
+        set(OPT_C_FLAG "-Os")
+        set(OPT_CXX_FLAG "-Os")
     endif ()
+else()
+    set(OPT_C_FLAG "-O0")
+    set(OPT_CXX_FLAG "-O0")
+endif(RELEASE_BUILD)
+
+# set compiler flags - more are tested and added later
+set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
+set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
+if (NOT CMAKE_COMPILER_IS_CLANG)
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
+endif()
+
+if (NOT RELEASE_BUILD)
+    # -Werror is most useful during development, don't potentially break
+    # release builds
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
+endif()
+
+if (DISABLE_ASSERTS)
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCC)
+    # spurious warnings?
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
+    endif ()
+    # don't complain about abi
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
+endif()
+
+if (NOT(ARCH_IA32 AND RELEASE_BUILD))
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
+endif()
+
 
 CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
 if (ARCH_IA32 OR ARCH_X86_64)
@@ -289,8 +307,6 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
       message(FATAL_ERROR "arm_sve.h is required to build for SVE.")
     endif()
   endif()
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
 elseif (ARCH_PPC64EL)
   CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
 endif()
@@ -318,8 +334,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     # This is a Linux-only feature for now - requires platform support
     # elsewhere
     message(STATUS "generator is ${CMAKE_GENERATOR}")
-    if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND
-        CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")
+    if (CMAKE_C_COMPILER_IS_CLANG AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")
         message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime")
         set (FAT_RUNTIME_REQUISITES FALSE)
     elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
@@ -343,7 +358,10 @@ include (${CMAKE_MODULE_PATH}/arch.cmake)
 # testing a builtin takes a little more work
 CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
 CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
-CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+# Clang does not use __builtin_constant_p() the same way as gcc
+if (NOT CMAKE_COMPILER_IS_CLANG)
+   CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+endif()
 
 set(C_FLAGS_TO_CHECK
 # Variable length arrays are way bad, most especially at run time
@@ -442,18 +460,22 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 
-if (NOT FAT_RUNTIME)
-    if (CROSS_COMPILE_AARCH64)
+if (FAT_RUNTIME)
+    if (NOT (ARCH_IA32 OR ARCH_X86_64))
+        message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures")
+    else()
+        message(STATUS "Building runtime for multiple microarchitectures")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+    endif()
+else()
+    if (CROSS_COMPILE)
         message(STATUS "Building for target CPU: ${ARCH_C_FLAGS}")
     else()
         message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}")
     endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
-else()
-    message(STATUS "Building runtime for multiple microarchitectures")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
 
 add_subdirectory(util)
@@ -1171,8 +1193,8 @@ if (NOT FAT_RUNTIME)
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
 
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
-	if (ARCH_IA32)
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
+        if (ARCH_IA32)
+            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2")
         endif (ARCH_IA32)
 
         add_library(hs STATIC
@@ -1212,7 +1234,7 @@ else (FAT_RUNTIME)
         add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
         set_target_properties(hs_exec_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
 
@@ -1255,8 +1277,8 @@ else (FAT_RUNTIME)
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
         if (ARCH_IA32 OR ARCH_X86_64)
-            set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-mssse3")
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
+            set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-msse4.2")
+            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2")
         endif ()
 
         # we want the static lib for testing
@@ -1281,7 +1303,7 @@ else (FAT_RUNTIME)
         add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
         set_target_properties(hs_exec_shared_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )

From 5d23e6dab67473f34d5814ba2c9967d19ae11dbd Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Wed, 1 Dec 2021 21:45:31 +0000
Subject: [PATCH 05/17] set -msse4.2 only on Intel

---
 CMakeLists.txt | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90395329..d61b4a4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -182,21 +182,30 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
     endif()
     message(STATUS "gcc will tune for ${GNUCC_ARCH}")
 elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
-    set(GNUCC_ARCH native)
-    set(TUNE_FLAG generic)
     message(STATUS "clang will tune for ${TUNE_FLAG}")
-    if (BUILD_AVX512)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}")
-    elseif (BUILD_AVX2)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+    if (ARCH_IA32 OR ARCH_X86_64)
+        set(GNUCC_ARCH native)
+        set(TUNE_FLAG generic)
+        if (BUILD_AVX512)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}")
+        elseif (BUILD_AVX2)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+        else()
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+        endif()
+    elseif(ARCH_AARCH64)
+       set(GNUCC_ARCH armv8)
+       set(TUNE_FLAG generic)
+    elseif(ARCH_ARM32)
+       set(GNUCC_ARCH armv7a)
+       set(TUNE_FLAG generic)
     else()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+       set(GNUCC_ARCH native)
+       set(TUNE_FLAG generic)
     endif()
-    message(STATUS "${CMAKE_C_FLAGS}")
-    message(STATUS "${CMAKE_CXX_FLAGS}")
 elseif (CROSS_COMPILE)
     set(GNUCC_ARCH generic)
     set(TUNE_FLAG generic)
@@ -214,10 +223,9 @@ endif(ARCH_AARCH64)
 
 if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
     if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-            set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+        set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
     endif()
-	 
-	  if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
         set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
     endif()
 endif()

From 4aa32275f16282829cc58b9efb1c50dcabd53d14 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@freevec.org>
Date: Thu, 2 Dec 2021 18:00:02 +0200
Subject: [PATCH 06/17] use same definition of the union for all types

---
 src/util/supervector/supervector.hpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 3ab3b13f..f0ddf63c 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -165,7 +165,7 @@ public:
     typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
     typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
 
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
     uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
     int64x2_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
     uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@@ -174,15 +174,6 @@ public:
     int16x8_t  ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
     uint8x16_t ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
     int8x16_t  ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
-#elif defined(ARCH_PPC64EL)
-    __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
-    __vector int64_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
-    __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
-    __vector int32_t  ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
-    __vector uint16_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size];
-    __vector int16_t  ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
-    __vector uint8_t  ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
-    __vector int8_t   ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];	
 #endif
 
     uint64_t u64[SIZE / sizeof(uint64_t)];

From 5aae719ecdeea8b917176956555e67fc58bc27be Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@freevec.org>
Date: Thu, 2 Dec 2021 18:01:00 +0200
Subject: [PATCH 07/17] fix build with clang, in particular VSX uses long long
 instead of int64_t, gcc allows this, clang does not

---
 src/util/arch/ppc64el/simd_types.h          |  2 +-
 src/util/arch/ppc64el/simd_utils.h          | 22 ++++++--
 src/util/supervector/arch/ppc64el/impl.cpp  | 62 +++++++++------------
 src/util/supervector/arch/ppc64el/types.hpp | 14 ++++-
 4 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h
index 21dae5cb..8a5b0e25 100644
--- a/src/util/arch/ppc64el/simd_types.h
+++ b/src/util/arch/ppc64el/simd_types.h
@@ -30,7 +30,7 @@
 #define ARCH_PPC64EL_SIMD_TYPES_H
 
 #if !defined(m128) && defined(HAVE_VSX)
-typedef __vector int32_t m128;
+typedef __vector int m128;
 #endif
 
 #endif /* ARCH_PPC64EL_SIMD_TYPES_H  */
diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h
index 137fc94f..d046ed47 100644
--- a/src/util/arch/ppc64el/simd_utils.h
+++ b/src/util/arch/ppc64el/simd_utils.h
@@ -43,6 +43,18 @@
 
 #include <string.h> // for memcpy
 
+typedef __vector unsigned long long int  uint64x2_t;
+typedef __vector   signed long long int   int64x2_t;
+typedef __vector unsigned int            uint32x4_t;
+typedef __vector   signed int             int32x4_t;
+typedef __vector unsigned short int      uint16x8_t;
+typedef __vector   signed short int       int16x8_t;
+typedef __vector unsigned char           uint8x16_t;
+typedef __vector  signed char             int8x16_t;
+
+typedef unsigned long long int ulong64_t;
+typedef   signed long long int  long64_t;
+/*
 typedef __vector  uint64_t uint64x2_t;
 typedef __vector   int64_t  int64x2_t;
 typedef __vector  uint32_t uint32x4_t;
@@ -50,7 +62,7 @@ typedef __vector   int32_t  int32x4_t;
 typedef __vector  uint16_t uint16x8_t;
 typedef __vector   int16_t  int16x8_t;
 typedef __vector   uint8_t uint8x16_t;
-typedef __vector    int8_t  int8x16_t;
+typedef __vector    int8_t  int8x16_t;*/
 
 
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
@@ -182,13 +194,13 @@ m128 rshift_m128(m128 a, unsigned b) {
 
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
-  uint64x2_t shift_indices = vec_splats((uint64_t)b); 
+  uint64x2_t shift_indices = vec_splats((ulong64_t)b); 
   return (m128) vec_sl((int64x2_t)a, shift_indices);
 }
 
 static really_really_inline
 m128 rshift64_m128(m128 a, unsigned  b) {
-  uint64x2_t shift_indices = vec_splats((uint64_t)b); 
+  uint64x2_t shift_indices = vec_splats((ulong64_t)b); 
   return (m128) vec_sr((int64x2_t)a, shift_indices);
 }
 
@@ -213,11 +225,11 @@ static really_inline u32 movemask128(m128 a) {
    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
    
    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-   uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff));
+   uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
   
    uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
-   uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff));
+   uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
  
    return s5[0];
diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp
index e054e02e..109b8d5e 100644
--- a/src/util/supervector/arch/ppc64el/impl.cpp
+++ b/src/util/supervector/arch/ppc64el/impl.cpp
@@ -39,16 +39,6 @@
 #include "util/supervector/supervector.hpp"
 #include <iostream>
 
-
-typedef __vector uint64_t uint64x2_t;
-typedef __vector  int64_t  int64x2_t;
-typedef __vector uint32_t uint32x4_t;
-typedef __vector  int32_t  int32x4_t;
-typedef __vector uint16_t uint16x8_t;
-typedef __vector  int16_t  int16x8_t;
-typedef __vector  uint8_t uint8x16_t;
-typedef __vector   int8_t  int8x16_t;
-
 // 128-bit Powerpc64le implementation
 
 template<>
@@ -65,58 +55,58 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint64_t>(other));
+    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
 }
 
 // Constants
@@ -229,11 +219,11 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(
     uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
 
     uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff));
+    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
     uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
 
     uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
-    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff));
+    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
     uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
     
     return s5[0];
@@ -271,7 +261,7 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
 {
-    return { (m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)N)) };
+    return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
 }
 
 template <>
@@ -313,7 +303,7 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
 {		 
-   return { (m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)N)) }; 
+   return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; 
 }
 
 template <>
@@ -352,7 +342,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
     return result;
 }
 
@@ -362,7 +352,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N)
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result; 
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
     return result;
 }
 
@@ -372,7 +362,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
     return result;
 }
 
@@ -382,7 +372,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
     return result;
 }
 
@@ -392,7 +382,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
     return result;
 }
 
@@ -408,7 +398,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
     return result;
 }
 
@@ -418,7 +408,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
     return result;
 }
 
@@ -428,7 +418,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
     return result;
 }
 
@@ -438,7 +428,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
     return result;
 }
 
@@ -448,7 +438,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N)
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), u.s8x16[0], 16 - n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
     return result;
 }
 
@@ -523,14 +513,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
-    return (m128) vec_xl(0, (const int64_t*)ptr);
+    return (m128) vec_xl(0, (const long64_t*)ptr);
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
-    return (m128)  vec_xl(0, (const int64_t*)ptr);
+    return (m128)  vec_xl(0, (const long64_t*)ptr);
 }
 
 template <>
diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp
index dbd863f4..bdc6608e 100644
--- a/src/util/supervector/arch/ppc64el/types.hpp
+++ b/src/util/supervector/arch/ppc64el/types.hpp
@@ -27,6 +27,18 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+typedef __vector unsigned long long int  uint64x2_t;
+typedef __vector   signed long long int   int64x2_t;
+typedef __vector unsigned int            uint32x4_t;
+typedef __vector   signed int             int32x4_t;
+typedef __vector unsigned short int      uint16x8_t;
+typedef __vector   signed short int       int16x8_t;
+typedef __vector unsigned char           uint8x16_t;
+typedef __vector  signed char             int8x16_t;
+
+typedef unsigned long long int ulong64_t;
+typedef   signed long long int  long64_t;
+
 #if !defined(m128) && defined(HAVE_VSX)
-typedef __vector int32_t m128;
+typedef __vector int m128;
 #endif

From 451d539f1d3e89fe885429aeba4a47b1327cd505 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <markos@freevec.org>
Date: Thu, 2 Dec 2021 18:01:26 +0200
Subject: [PATCH 08/17] Power does not use -march

---
 CMakeLists.txt | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d61b4a4a..10829fb8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,6 +154,12 @@ endif ()
 # Detect best GNUCC_ARCH to tune for
 if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
     message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
+
+    if(ARCH_PPC64EL)
+        set(ARCH_FLAG mcpu)
+    else()
+        set(ARCH_FLAG march)
+    endif()
     # If gcc doesn't recognise the host cpu, then mtune=native becomes
     # generic, which isn't very good in some cases. march=native looks at
     # cpuid info and then chooses the best microarch it can (and replaces
@@ -161,12 +167,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
 
     # arg1 might exist if using ccache
     string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
-    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
+    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -mtune=native)
     execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
         OUTPUT_VARIABLE _GCC_OUTPUT)
-    string(FIND "${_GCC_OUTPUT}" "march" POS)
+    string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}" POS)
     string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
-    string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
+    string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
 
     # test the parsed flag
     set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})

From 6b364021d190113fec9d770d3d00e9dfb640cee5 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Thu, 2 Dec 2021 23:09:34 +0200
Subject: [PATCH 09/17] don't fail if mtune does not return a valid
 configuration

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10829fb8..9c58fd46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,12 +181,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
         INPUT_FILE /dev/null
         RESULT_VARIABLE GNUCC_TUNE_TEST)
     if (NOT GNUCC_TUNE_TEST EQUAL 0)
-        message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
+        message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid, falling back to -mtune=native")
         set(TUNE_FLAG native)
     else()
         set(TUNE_FLAG ${GNUCC_ARCH})
+        message(STATUS "gcc will tune for ${GNUCC_ARCH}")
     endif()
-    message(STATUS "gcc will tune for ${GNUCC_ARCH}")
 elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
     message(STATUS "clang will tune for ${TUNE_FLAG}")
     if (ARCH_IA32 OR ARCH_X86_64)

From 7cad5143662c6b83df86d78e385ec7f04e528a2b Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Thu, 2 Dec 2021 23:09:53 +0200
Subject: [PATCH 10/17] clang is more strict

---
 unit/internal/simd_utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index 900078bb..bc2421dc 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) {
     simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     int64x2_t a = { 0x123456789abcdefLL, ~0LL };
-    simd = vreinterpretq_s64_s8(a);
+    simd = vreinterpretq_s32_s64(a);
 #elif defined(ARCH_PPC64EL)
     int64x2_t a = {0x123456789abcdefLL, ~0LL };
     simd = (m128) a;

From 07ce6d8e7fb7d900da7d488c854f123a08e534b5 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Fri, 3 Dec 2021 16:24:58 +0200
Subject: [PATCH 11/17] fix build failures with clang on x86, make sure
 compilation works on other Power as well

---
 CMakeLists.txt        | 98 ++++++++++++++++++++++---------------------
 src/util/simd_types.h |  1 +
 util/CMakeLists.txt   |  3 --
 3 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9c58fd46..3485e5f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,15 +151,16 @@ else ()
     set(ICELAKE_FLAG "-march=icelake-server")
 endif ()
 
+if(ARCH_PPC64EL)
+    set(ARCH_FLAG mcpu)
+else()
+    set(ARCH_FLAG march)
+endif()
+
 # Detect best GNUCC_ARCH to tune for
 if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
     message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
 
-    if(ARCH_PPC64EL)
-        set(ARCH_FLAG mcpu)
-    else()
-        set(ARCH_FLAG march)
-    endif()
     # If gcc doesn't recognise the host cpu, then mtune=native becomes
     # generic, which isn't very good in some cases. march=native looks at
     # cpuid info and then chooses the best microarch it can (and replaces
@@ -185,23 +186,12 @@ if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
         set(TUNE_FLAG native)
     else()
         set(TUNE_FLAG ${GNUCC_ARCH})
-        message(STATUS "gcc will tune for ${GNUCC_ARCH}")
+        message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
     endif()
 elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
-    message(STATUS "clang will tune for ${TUNE_FLAG}")
     if (ARCH_IA32 OR ARCH_X86_64)
         set(GNUCC_ARCH native)
         set(TUNE_FLAG generic)
-        if (BUILD_AVX512)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SKYLAKE_FLAG}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SKYLAKE_FLAG}")
-        elseif (BUILD_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
-        else()
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
-        endif()
     elseif(ARCH_AARCH64)
        set(GNUCC_ARCH armv8)
        set(TUNE_FLAG generic)
@@ -212,11 +202,30 @@ elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)
        set(GNUCC_ARCH native)
        set(TUNE_FLAG generic)
     endif()
+    message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
 elseif (CROSS_COMPILE)
     set(GNUCC_ARCH generic)
     set(TUNE_FLAG generic)
 endif()
 
+if (ARCH_IA32 OR ARCH_X86_64)
+    if (NOT FAT_RUNTIME)
+        if (BUILD_AVX512)
+            set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
+            set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
+        elseif (BUILD_AVX2)
+            set(ARCH_C_FLAGS "-mavx2")
+            set(ARCH_CXX_FLAGS "-mavx2")
+        else()
+            set(ARCH_C_FLAGS "-msse4.2")
+            set(ARCH_CXX_FLAGS "-msse4.2")
+        endif()
+    else()
+       set(ARCH_C_FLAGS "-msse4.2")
+       set(ARCH_CXX_FLAGS "-msse4.2")
+    endif()
+endif()
+
 if (ARCH_AARCH64)
     if (BUILD_SVE2_BITPERM)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm")
@@ -227,23 +236,26 @@ if (ARCH_AARCH64)
     endif ()
 endif(ARCH_AARCH64)
 
-if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
-    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-        set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-    endif()
-    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-        set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-    endif()
-endif()
+set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
+set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
+
+#if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
+#    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+#    endif()
+#    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
+#    endif()
+#endif()
     
-if(ARCH_PPC64EL)
-    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-        set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
-    endif()
-    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-        set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
-    endif()
-endif()
+#if(ARCH_PPC64EL)
+#    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#endif()
 
 # compiler version checks TODO: test more compilers
 if (CMAKE_COMPILER_IS_GNUCXX)
@@ -306,7 +318,6 @@ if (NOT(ARCH_IA32 AND RELEASE_BUILD))
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
 endif()
 
-
 CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
 if (ARCH_IA32 OR ARCH_X86_64)
   CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
@@ -474,13 +485,12 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 
+
 if (FAT_RUNTIME)
     if (NOT (ARCH_IA32 OR ARCH_X86_64))
         message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures")
     else()
         message(STATUS "Building runtime for multiple microarchitectures")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
     endif()
 else()
     if (CROSS_COMPILE)
@@ -488,9 +498,9 @@ else()
     else()
         message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}")
     endif()
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
 endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
 
 add_subdirectory(util)
 add_subdirectory(doc/dev-reference)
@@ -1207,10 +1217,6 @@ if (NOT FAT_RUNTIME)
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
 
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
-        if (ARCH_IA32)
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2")
-        endif (ARCH_IA32)
-
         add_library(hs STATIC
             src/hs_version.c
             src/hs_valid_platform.c
@@ -1241,7 +1247,7 @@ else (FAT_RUNTIME)
         add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
         set_target_properties(hs_exec_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
 
@@ -1290,10 +1296,6 @@ else (FAT_RUNTIME)
             ${RUNTIME_LIBS})
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
-        if (ARCH_IA32 OR ARCH_X86_64)
-            set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-msse4.2")
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-msse4.2")
-        endif ()
 
         # we want the static lib for testing
         add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c
@@ -1310,7 +1312,7 @@ else (FAT_RUNTIME)
         add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
         set_target_properties(hs_exec_shared_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 0deff7e5..4f0fd1a9 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -51,6 +51,7 @@ typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
 #endif
 
 typedef struct {m128 lo; m128 mid; m128 hi;} m384;
+
 #if !defined(m512) && !defined(HAVE_SIMD_512_BITS)
 typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
 #endif
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index 82cee0ff..ea942ef1 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -33,9 +33,6 @@ SET(corpusomatic_SRCS
     ng_find_matches.cpp
 )
 add_library(corpusomatic STATIC ${corpusomatic_SRCS})
-if (ARCH_IA32 OR ARCH_X86_64)
-    set_target_properties(corpusomatic PROPERTIES COMPILE_FLAGS "-mssse3")
-endif ()
 
 set(databaseutil_SRCS
     database_util.cpp

From 290eabbca08e7e591ea53cfe3bf37bce5bc7f9fb Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Mon, 6 Dec 2021 18:22:58 +0000
Subject: [PATCH 12/17] fix compilation with clang and some incomplete/wrong
 implementations for arm this time

---
 src/util/arch/arm/simd_utils.h         | 238 ++++++++++++++++++++++++-
 src/util/supervector/arch/arm/impl.cpp |  62 +++----
 2 files changed, 264 insertions(+), 36 deletions(-)

diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 4c68b485..96cd332c 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -122,24 +122,252 @@ m128 sub_2x64(m128 a, m128 b) {
     return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b);
 }
 
-static really_really_inline
+static really_inline
 m128 lshift_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT_m128(a, offset)  case offset: return (m128)vshlq_n_u32((int8x16_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_LSHIFT_m128(a,  1);
+    CASE_LSHIFT_m128(a,  2);
+    CASE_LSHIFT_m128(a,  3);
+    CASE_LSHIFT_m128(a,  4);
+    CASE_LSHIFT_m128(a,  5);
+    CASE_LSHIFT_m128(a,  6);
+    CASE_LSHIFT_m128(a,  7);
+    CASE_LSHIFT_m128(a,  8);
+    CASE_LSHIFT_m128(a,  9);
+    CASE_LSHIFT_m128(a, 10);
+    CASE_LSHIFT_m128(a, 11);
+    CASE_LSHIFT_m128(a, 12);
+    CASE_LSHIFT_m128(a, 13);
+    CASE_LSHIFT_m128(a, 14);
+    CASE_LSHIFT_m128(a, 15);
+    CASE_LSHIFT_m128(a, 16);
+    CASE_LSHIFT_m128(a, 17);
+    CASE_LSHIFT_m128(a, 18);
+    CASE_LSHIFT_m128(a, 19);
+    CASE_LSHIFT_m128(a, 20);
+    CASE_LSHIFT_m128(a, 21);
+    CASE_LSHIFT_m128(a, 22);
+    CASE_LSHIFT_m128(a, 23);
+    CASE_LSHIFT_m128(a, 24);
+    CASE_LSHIFT_m128(a, 25);
+    CASE_LSHIFT_m128(a, 26);
+    CASE_LSHIFT_m128(a, 27);
+    CASE_LSHIFT_m128(a, 28);
+    CASE_LSHIFT_m128(a, 29);
+    CASE_LSHIFT_m128(a, 30);
+    CASE_LSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT_m128
 }
 
 static really_really_inline
 m128 rshift_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT_m128(a, offset)  case offset: return (m128)vshrq_n_u32((int8x16_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_RSHIFT_m128(a,  1);
+    CASE_RSHIFT_m128(a,  2);
+    CASE_RSHIFT_m128(a,  3);
+    CASE_RSHIFT_m128(a,  4);
+    CASE_RSHIFT_m128(a,  5);
+    CASE_RSHIFT_m128(a,  6);
+    CASE_RSHIFT_m128(a,  7);
+    CASE_RSHIFT_m128(a,  8);
+    CASE_RSHIFT_m128(a,  9);
+    CASE_RSHIFT_m128(a, 10);
+    CASE_RSHIFT_m128(a, 11);
+    CASE_RSHIFT_m128(a, 12);
+    CASE_RSHIFT_m128(a, 13);
+    CASE_RSHIFT_m128(a, 14);
+    CASE_RSHIFT_m128(a, 15);
+    CASE_RSHIFT_m128(a, 16);
+    CASE_RSHIFT_m128(a, 17);
+    CASE_RSHIFT_m128(a, 18);
+    CASE_RSHIFT_m128(a, 19);
+    CASE_RSHIFT_m128(a, 20);
+    CASE_RSHIFT_m128(a, 21);
+    CASE_RSHIFT_m128(a, 22);
+    CASE_RSHIFT_m128(a, 23);
+    CASE_RSHIFT_m128(a, 24);
+    CASE_RSHIFT_m128(a, 25);
+    CASE_RSHIFT_m128(a, 26);
+    CASE_RSHIFT_m128(a, 27);
+    CASE_RSHIFT_m128(a, 28);
+    CASE_RSHIFT_m128(a, 29);
+    CASE_RSHIFT_m128(a, 30);
+    CASE_RSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT_m128
 }
 
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT64_m128(a, offset)  case offset: return (m128)vshlq_n_u64((int8x16_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_LSHIFT64_m128(a,  1);
+    CASE_LSHIFT64_m128(a,  2);
+    CASE_LSHIFT64_m128(a,  3);
+    CASE_LSHIFT64_m128(a,  4);
+    CASE_LSHIFT64_m128(a,  5);
+    CASE_LSHIFT64_m128(a,  6);
+    CASE_LSHIFT64_m128(a,  7);
+    CASE_LSHIFT64_m128(a,  8);
+    CASE_LSHIFT64_m128(a,  9);
+    CASE_LSHIFT64_m128(a, 10);
+    CASE_LSHIFT64_m128(a, 11);
+    CASE_LSHIFT64_m128(a, 12);
+    CASE_LSHIFT64_m128(a, 13);
+    CASE_LSHIFT64_m128(a, 14);
+    CASE_LSHIFT64_m128(a, 15);
+    CASE_LSHIFT64_m128(a, 16);
+    CASE_LSHIFT64_m128(a, 17);
+    CASE_LSHIFT64_m128(a, 18);
+    CASE_LSHIFT64_m128(a, 19);
+    CASE_LSHIFT64_m128(a, 20);
+    CASE_LSHIFT64_m128(a, 21);
+    CASE_LSHIFT64_m128(a, 22);
+    CASE_LSHIFT64_m128(a, 23);
+    CASE_LSHIFT64_m128(a, 24);
+    CASE_LSHIFT64_m128(a, 25);
+    CASE_LSHIFT64_m128(a, 26);
+    CASE_LSHIFT64_m128(a, 27);
+    CASE_LSHIFT64_m128(a, 28);
+    CASE_LSHIFT64_m128(a, 29);
+    CASE_LSHIFT64_m128(a, 30);
+    CASE_LSHIFT64_m128(a, 31);
+    CASE_LSHIFT64_m128(a, 32);
+    CASE_LSHIFT64_m128(a, 33);
+    CASE_LSHIFT64_m128(a, 34);
+    CASE_LSHIFT64_m128(a, 35);
+    CASE_LSHIFT64_m128(a, 36);
+    CASE_LSHIFT64_m128(a, 37);
+    CASE_LSHIFT64_m128(a, 38);
+    CASE_LSHIFT64_m128(a, 39);
+    CASE_LSHIFT64_m128(a, 40);
+    CASE_LSHIFT64_m128(a, 41);
+    CASE_LSHIFT64_m128(a, 42);
+    CASE_LSHIFT64_m128(a, 43);
+    CASE_LSHIFT64_m128(a, 44);
+    CASE_LSHIFT64_m128(a, 45);
+    CASE_LSHIFT64_m128(a, 46);
+    CASE_LSHIFT64_m128(a, 47);
+    CASE_LSHIFT64_m128(a, 48);
+    CASE_LSHIFT64_m128(a, 49);
+    CASE_LSHIFT64_m128(a, 50);
+    CASE_LSHIFT64_m128(a, 51);
+    CASE_LSHIFT64_m128(a, 52);
+    CASE_LSHIFT64_m128(a, 53);
+    CASE_LSHIFT64_m128(a, 54);
+    CASE_LSHIFT64_m128(a, 55);
+    CASE_LSHIFT64_m128(a, 56);
+    CASE_LSHIFT64_m128(a, 57);
+    CASE_LSHIFT64_m128(a, 58);
+    CASE_LSHIFT64_m128(a, 59);
+    CASE_LSHIFT64_m128(a, 60);
+    CASE_LSHIFT64_m128(a, 61);
+    CASE_LSHIFT64_m128(a, 62);
+    CASE_LSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT64_m128
 }
 
 static really_really_inline
 m128 rshift64_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT64_m128(a, offset)  case offset: return (m128)vshrq_n_u64((int8x16_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_RSHIFT64_m128(a,  1);
+    CASE_RSHIFT64_m128(a,  2);
+    CASE_RSHIFT64_m128(a,  3);
+    CASE_RSHIFT64_m128(a,  4);
+    CASE_RSHIFT64_m128(a,  5);
+    CASE_RSHIFT64_m128(a,  6);
+    CASE_RSHIFT64_m128(a,  7);
+    CASE_RSHIFT64_m128(a,  8);
+    CASE_RSHIFT64_m128(a,  9);
+    CASE_RSHIFT64_m128(a, 10);
+    CASE_RSHIFT64_m128(a, 11);
+    CASE_RSHIFT64_m128(a, 12);
+    CASE_RSHIFT64_m128(a, 13);
+    CASE_RSHIFT64_m128(a, 14);
+    CASE_RSHIFT64_m128(a, 15);
+    CASE_RSHIFT64_m128(a, 16);
+    CASE_RSHIFT64_m128(a, 17);
+    CASE_RSHIFT64_m128(a, 18);
+    CASE_RSHIFT64_m128(a, 19);
+    CASE_RSHIFT64_m128(a, 20);
+    CASE_RSHIFT64_m128(a, 21);
+    CASE_RSHIFT64_m128(a, 22);
+    CASE_RSHIFT64_m128(a, 23);
+    CASE_RSHIFT64_m128(a, 24);
+    CASE_RSHIFT64_m128(a, 25);
+    CASE_RSHIFT64_m128(a, 26);
+    CASE_RSHIFT64_m128(a, 27);
+    CASE_RSHIFT64_m128(a, 28);
+    CASE_RSHIFT64_m128(a, 29);
+    CASE_RSHIFT64_m128(a, 30);
+    CASE_RSHIFT64_m128(a, 31);
+    CASE_RSHIFT64_m128(a, 32);
+    CASE_RSHIFT64_m128(a, 33);
+    CASE_RSHIFT64_m128(a, 34);
+    CASE_RSHIFT64_m128(a, 35);
+    CASE_RSHIFT64_m128(a, 36);
+    CASE_RSHIFT64_m128(a, 37);
+    CASE_RSHIFT64_m128(a, 38);
+    CASE_RSHIFT64_m128(a, 39);
+    CASE_RSHIFT64_m128(a, 40);
+    CASE_RSHIFT64_m128(a, 41);
+    CASE_RSHIFT64_m128(a, 42);
+    CASE_RSHIFT64_m128(a, 43);
+    CASE_RSHIFT64_m128(a, 44);
+    CASE_RSHIFT64_m128(a, 45);
+    CASE_RSHIFT64_m128(a, 46);
+    CASE_RSHIFT64_m128(a, 47);
+    CASE_RSHIFT64_m128(a, 48);
+    CASE_RSHIFT64_m128(a, 49);
+    CASE_RSHIFT64_m128(a, 50);
+    CASE_RSHIFT64_m128(a, 51);
+    CASE_RSHIFT64_m128(a, 52);
+    CASE_RSHIFT64_m128(a, 53);
+    CASE_RSHIFT64_m128(a, 54);
+    CASE_RSHIFT64_m128(a, 55);
+    CASE_RSHIFT64_m128(a, 56);
+    CASE_RSHIFT64_m128(a, 57);
+    CASE_RSHIFT64_m128(a, 58);
+    CASE_RSHIFT64_m128(a, 59);
+    CASE_RSHIFT64_m128(a, 60);
+    CASE_RSHIFT64_m128(a, 61);
+    CASE_RSHIFT64_m128(a, 62);
+    CASE_RSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT64_m128
 }
 
 static really_inline m128 eq128(m128 a, m128 b) {
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index 980f0b39..ff1149a9 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -45,112 +45,112 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t other)
+really_inline SuperVector<16>::SuperVector(int8x16_t other)
 {
     u.s8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t other)
+really_inline SuperVector<16>::SuperVector(uint8x16_t other)
 {
     u.u8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16x8_t>(int16x8_t other)
+really_inline SuperVector<16>::SuperVector(int16x8_t other)
 {
     u.s16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16x8_t>(uint16x8_t other)
+really_inline SuperVector<16>::SuperVector(uint16x8_t other)
 {
     u.u16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32x4_t>(int32x4_t other)
+really_inline SuperVector<16>::SuperVector(int32x4_t other)
 {
     u.s32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32x4_t>(uint32x4_t other)
+really_inline SuperVector<16>::SuperVector(uint32x4_t other)
 {
     u.u32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64x2_t>(int64x2_t other)
+really_inline SuperVector<16>::SuperVector(int64x2_t other)
 {
     u.s64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64x2_t>(uint64x2_t other)
+really_inline SuperVector<16>::SuperVector(uint64x2_t other)
 {
     u.u64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.s8x16[0] = vdupq_n_s8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.u8x16[0] = vdupq_n_u8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.s16x8[0] = vdupq_n_s16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.u16x8[0] = vdupq_n_u16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.s32x4[0] = vdupq_n_s32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.u32x4[0] = vdupq_n_u32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
     u.s64x2[0] = vdupq_n_s64(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
     u.u64x2[0] = vdupq_n_u64(other);
 }
@@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 
@@ -386,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
 
@@ -394,9 +394,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -404,9 +404,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -416,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
 }
 
@@ -430,9 +430,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 
@@ -442,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
 
@@ -450,9 +450,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -460,9 +460,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
 }
 

From d3f0d8dd704a5500be641b693dcf1e361ec59f47 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Mon, 6 Dec 2021 18:38:01 +0000
Subject: [PATCH 13/17] update Jenkinsfile for all configurations

---
 Jenkinsfile | 606 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 587 insertions(+), 19 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 1883f43a..3dbef5b6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,22 +1,590 @@
 pipeline {
-  agent {
-    node {
-      label 'x86'
-    }
-
-  }
-  stages {
-    stage('Release, SSE') {
-      agent {
-        node {
-          label 'x86'
+    agent none
+    stages {
+        stage("Build") {
+            failFast true
+            parallel {
+                stage("Release/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    }
+                }
+                stage("Clang-Release/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+            }
         }
-
-      }
-      steps {
-        sh 'mkdir build-release-SSE &&  cmake -DCMAKE_BUILD_TYPE=Release   -C build-release-SSE'
-      }
     }
-
-  }
-}
\ No newline at end of file
+}

From deeb113977af4ef2fb72c6c7551cf56d19be3291 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Mon, 6 Dec 2021 21:35:37 +0000
Subject: [PATCH 14/17] lower gcc minver to 9 to enable building on Ubuntu 20
 LTS

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3485e5f8..76bca813 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -259,7 +259,7 @@ set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_F
 
 # compiler version checks TODO: test more compilers
 if (CMAKE_COMPILER_IS_GNUCXX)
-    set(GNUCXX_MINVER "10")
+    set(GNUCXX_MINVER "9")
     message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
         message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")

From fec557c1f9ca7d9eae4ca6a3e419a50bef674a06 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Mon, 6 Dec 2021 21:35:51 +0000
Subject: [PATCH 15/17] fix wrong castings for NEON

---
 src/util/arch/arm/simd_utils.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 96cd332c..d1ab583f 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -129,7 +129,7 @@ m128 lshift_m128(m128 a, unsigned b) {
         return (m128) vshlq_n_u32((uint32x4_t)a, b);
     }
 #endif
-#define CASE_LSHIFT_m128(a, offset)  case offset: return (m128)vshlq_n_u32((int8x16_t)(a), (offset)); break;
+#define CASE_LSHIFT_m128(a, offset)  case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
     switch (b) {
     case 0:  return a; break;
     CASE_LSHIFT_m128(a,  1);
@@ -175,7 +175,7 @@ m128 rshift_m128(m128 a, unsigned b) {
         return (m128) vshrq_n_u32((uint32x4_t)a, b);
     }
 #endif
-#define CASE_RSHIFT_m128(a, offset)  case offset: return (m128)vshrq_n_u32((int8x16_t)(a), (offset)); break;
+#define CASE_RSHIFT_m128(a, offset)  case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
     switch (b) {
     case 0:  return a; break;
     CASE_RSHIFT_m128(a,  1);
@@ -221,7 +221,7 @@ m128 lshift64_m128(m128 a, unsigned b) {
         return (m128) vshlq_n_u64((uint64x2_t)a, b);
     }
 #endif
-#define CASE_LSHIFT64_m128(a, offset)  case offset: return (m128)vshlq_n_u64((int8x16_t)(a), (offset)); break;
+#define CASE_LSHIFT64_m128(a, offset)  case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
     switch (b) {
     case 0:  return a; break;
     CASE_LSHIFT64_m128(a,  1);
@@ -299,7 +299,7 @@ m128 rshift64_m128(m128 a, unsigned b) {
         return (m128) vshrq_n_u64((uint64x2_t)a, b);
     }
 #endif
-#define CASE_RSHIFT64_m128(a, offset)  case offset: return (m128)vshrq_n_u64((int8x16_t)(a), (offset)); break;
+#define CASE_RSHIFT64_m128(a, offset)  case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
     switch (b) {
     case 0:  return a; break;
     CASE_RSHIFT64_m128(a,  1);

From fd2eabd0716477e29008da6772c499b855f6d48c Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Tue, 7 Dec 2021 08:43:52 +0000
Subject: [PATCH 16/17] fix clang-release-arm compilation

---
 src/util/arch/arm/simd_utils.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index d1ab583f..764d26fd 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -419,9 +419,10 @@ m128 load_m128_from_u64a(const u64a *p) {
 }
 
 static really_inline u32 extract32from128(const m128 in, unsigned imm) {
-#if defined(HS_OPTIMIZE)
-    return vgetq_lane_u32((uint32x4_t) in, imm);
-#else
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return vgetq_lane_u32((uint32x4_t) in, imm);
+#endif
     switch (imm) {
     case 0:
         return vgetq_lane_u32((uint32x4_t) in, 0);
@@ -439,13 +440,13 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) {
 	return 0;
 	break;
     }
-#endif
 }
 
 static really_inline u64a extract64from128(const m128 in, unsigned imm) {
-#if defined(HS_OPTIMIZE)
-    return vgetq_lane_u64((uint64x2_t) in, imm);
-#else
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return vgetq_lane_u64((uint64x2_t) in, imm);
+#endif
     switch (imm) {
     case 0:
         return vgetq_lane_u64((uint64x2_t) in, 0);
@@ -457,7 +458,6 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) {
 	return 0;
 	break;
     }
-#endif
 }
 
 static really_inline m128 low64from128(const m128 in) {

From 4589f1742e1ef24ea8e87a56a477e76a56358968 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Date: Tue, 7 Dec 2021 08:49:59 +0000
Subject: [PATCH 17/17] minor fixes

---
 src/util/arch/arm/simd_utils.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 764d26fd..902d3624 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -420,8 +420,9 @@ m128 load_m128_from_u64a(const u64a *p) {
 
 static really_inline u32 extract32from128(const m128 in, unsigned imm) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(b)) {
+    if (__builtin_constant_p(imm)) {
         return vgetq_lane_u32((uint32x4_t) in, imm);
+    }
 #endif
     switch (imm) {
     case 0:
@@ -444,8 +445,9 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) {
 
 static really_inline u64a extract64from128(const m128 in, unsigned imm) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(b)) {
+    if (__builtin_constant_p(imm)) {
         return vgetq_lane_u64((uint64x2_t) in, imm);
+    }
 #endif
     switch (imm) {
     case 0: