diff --git a/CMakeLists.txt b/CMakeLists.txt
index a741961c..76bca813 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ project (vectorscan C CXX)
 
 set (HS_MAJOR_VERSION 5)
 set (HS_MINOR_VERSION 4)
-set (HS_PATCH_VERSION 3)
+set (HS_PATCH_VERSION 5)  # patch component; joined into HS_VERSION (major.minor.patch) on the next line
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -128,11 +128,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r
 
 CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON)
 
-option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime"
-    OFF)
+option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" OFF)  # also switched ON below whenever BUILD_AVX512VBMI is set
 
-option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime"
-    OFF)
+option(BUILD_AVX512VBMI "Experimental: support avx512vbmi in the fat runtime" OFF)  # enabling this forces BUILD_AVX512 ON (see the if() below)
 
 if (BUILD_AVX512VBMI)
     set(BUILD_AVX512 ON)
@@ -140,47 +138,95 @@ endif ()
 
 # TODO: per platform config files?
 
-    # remove CMake's idea of optimisation
-    foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})
-        string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")
-        string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
-    endforeach ()
+# Strip CMake's own -O<level> flags from every per-configuration flag set; the level is chosen via OPT_C_FLAG/OPT_CXX_FLAG further down
+foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES})  # the single-config build type plus any multi-config types
+    string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}")  # removes "-O" followed by any run of non-space characters
+    string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
+endforeach ()
 
-    if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE_AARCH64 AND NOT ARCH_PPC64EL)
-        message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
-        # If gcc doesn't recognise the host cpu, then mtune=native becomes
-        # generic, which isn't very good in some cases. march=native looks at
-        # cpuid info and then chooses the best microarch it can (and replaces
-        # the flag), so use that for tune.
+if (CMAKE_C_COMPILER_ID MATCHES "Intel")  # Intel compiler spelling of the AVX-512 target switch
+    set(SKYLAKE_FLAG "-xCORE-AVX512")
+else ()  # every other compiler gets -march style names
+    set(SKYLAKE_FLAG "-march=skylake-avx512")
+    set(ICELAKE_FLAG "-march=icelake-server")
+endif ()  # NOTE(review): ICELAKE_FLAG stays unset for Intel compilers — confirm that is intended
 
-        # arg1 might exist if using ccache
-        string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
-        set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native)
-        execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-            OUTPUT_VARIABLE _GCC_OUTPUT)
-        string(FIND "${_GCC_OUTPUT}" "march" POS)
-        string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
-        string(REGEX REPLACE "march=[ \t]*([^ \n]*)[ \n].*" "\\1"
-            GNUCC_ARCH "${_GCC_OUTPUT}")
+if(ARCH_PPC64EL)  # ARCH_FLAG is the option name used to build "-<ARCH_FLAG>=<arch>" further down
+    set(ARCH_FLAG mcpu)
+else()
+    set(ARCH_FLAG march)
+endif()
 
-        if (ARCH_IA32 OR ARCH_X86_64)
-            # test the parsed flag
-            set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
-            execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
-                OUTPUT_QUIET ERROR_QUIET
-                INPUT_FILE /dev/null
-                RESULT_VARIABLE GNUCC_TUNE_TEST)
-            if (NOT GNUCC_TUNE_TEST EQUAL 0)
-                message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid")
-            endif()
-            set(TUNE_FLAG ${GNUCC_ARCH})
-        else()
-            set(TUNE_FLAG native)
-        endif()
-    elseif (NOT TUNE_FLAG)
+# Choose GNUCC_ARCH (value for -${ARCH_FLAG}=) and TUNE_FLAG (value for -mtune=): probed for native gcc, fixed defaults for clang and cross builds
+if (CMAKE_COMPILER_IS_GNUCC AND NOT CROSS_COMPILE)
+    message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
+
+    # If gcc doesn't recognise the host CPU, -mtune=native degrades to
+    # "generic", which isn't very good in some cases. -${ARCH_FLAG}=native looks
+    # at cpuid info and resolves to the best microarchitecture it knows (and
+    # replaces the flag), so that resolved name is parsed out and used for tune.
+
+    # CMAKE_C_COMPILER_ARG1 might be set, e.g. when building through ccache
+    string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
+    set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -mtune=native)  # asks gcc to dump its resolved target options
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_VARIABLE _GCC_OUTPUT)
+    string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}" POS)  # NOTE(review): POS is -1 when the option name is absent — confirm SUBSTRING below copes with that
+    string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)  # drop everything before the first occurrence
+    string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")  # keep only the value printed after "<flag>="
+
+    # Test the parsed name: preprocess empty input with -mtune=<name> and look at the exit code
+    set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH})
+    execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
+        OUTPUT_QUIET ERROR_QUIET
+        INPUT_FILE /dev/null
+        RESULT_VARIABLE GNUCC_TUNE_TEST)
+    if (NOT GNUCC_TUNE_TEST EQUAL 0)  # name rejected for -mtune: only TUNE_FLAG falls back, GNUCC_ARCH keeps the parsed value
+        message(WARNING "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid, falling back to -mtune=native")
         set(TUNE_FLAG native)
+    else()
+        set(TUNE_FLAG ${GNUCC_ARCH})
+        message(STATUS "gcc will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")  # both values are identical in this branch
     endif()
+elseif (CMAKE_COMPILER_IS_CLANG AND NOT CROSS_COMPILE)  # clang: no probing, fixed per-architecture values
+    if (ARCH_IA32 OR ARCH_X86_64)
+        set(GNUCC_ARCH native)
+        set(TUNE_FLAG generic)
+    elseif(ARCH_AARCH64)
+       set(GNUCC_ARCH armv8)
+       set(TUNE_FLAG generic)
+    elseif(ARCH_ARM32)
+       set(GNUCC_ARCH armv7a)
+       set(TUNE_FLAG generic)
+    else()  # same values as the x86 branch
+       set(GNUCC_ARCH native)
+       set(TUNE_FLAG generic)
+    endif()
+    message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
+elseif (CROSS_COMPILE)  # cross-compiling: "generic" for both values
+    set(GNUCC_ARCH generic)
+    set(TUNE_FLAG generic)
+endif()  # NOTE(review): a native build with a compiler that is neither gcc nor clang sets neither variable here
 
+if (ARCH_IA32 OR ARCH_X86_64)  # x86 ISA flags; the arch/tune flags are prepended to these further down
+    if (NOT FAT_RUNTIME)  # single-target build: BUILD_AVX512 wins over BUILD_AVX2, SSE4.2 is the default
+        if (BUILD_AVX512)
+            set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
+            set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
+        elseif (BUILD_AVX2)
+            set(ARCH_C_FLAGS "-mavx2")
+            set(ARCH_CXX_FLAGS "-mavx2")
+        else()
+            set(ARCH_C_FLAGS "-msse4.2")
+            set(ARCH_CXX_FLAGS "-msse4.2")
+        endif()
+    else()  # fat runtime: only -msse4.2 here, regardless of BUILD_AVX2/BUILD_AVX512
+       set(ARCH_C_FLAGS "-msse4.2")
+       set(ARCH_CXX_FLAGS "-msse4.2")
+    endif()
+endif()
+
+if (ARCH_AARCH64)  # AArch64 only: append the requested SVE extension suffix to GNUCC_ARCH
     if (BUILD_SVE2_BITPERM)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve2-bitperm")
     elseif (BUILD_SVE2)
@@ -188,92 +234,89 @@ endif ()
     elseif (BUILD_SVE)
         set(GNUCC_ARCH "${GNUCC_ARCH}+sve")
     endif ()
+endif(ARCH_AARCH64)
 
-    # compiler version checks TODO: test more compilers
-    if (CMAKE_COMPILER_IS_GNUCXX)
-        set(GNUCXX_MINVER "4.8.1")
-        message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)
-            message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support")
-        endif()
-    endif()
-
-    if(RELEASE_BUILD)
-        if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL)
-            set(OPT_C_FLAG "-O3")
-            set(OPT_CXX_FLAG "-O3")
-        else ()
-            set(OPT_C_FLAG "-Os")
-            set(OPT_CXX_FLAG "-Os")
-        endif ()
-    else()
-        set(OPT_C_FLAG "-O0")
-        set(OPT_CXX_FLAG "-O0")
-    endif(RELEASE_BUILD)
-
-    # set compiler flags - more are tested and added later
-    set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
-    set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing -fno-new-ttp-matching")
-
-    if (NOT RELEASE_BUILD)
-        # -Werror is most useful during development, don't potentially break
-        # release builds
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
-    endif()
-
-    if (DISABLE_ASSERTS)
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
-    endif()
+# Prepend the detected architecture/tuning flags to the ISA flags chosen above.
+# Detection can leave GNUCC_ARCH/TUNE_FLAG empty (native build with a compiler
+# that is neither gcc nor clang); a bare "-march= -mtune=" is not a usable
+# option pair, so in that case ARCH_C_FLAGS/ARCH_CXX_FLAGS are left as they are.
+if (GNUCC_ARCH AND TUNE_FLAG)
+    set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
+    set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
+else()
+    message(WARNING "Could not determine -${ARCH_FLAG}/-mtune values for this compiler; not adding them")
+endif()
 
     
-    if (ARCH_IA32 OR ARCH_X86_64 OR ARCH_ARM32 OR ARCH_AARCH64)
-	 if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-            set(ARCH_C_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-   	 endif()
-	 
-	 if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-            set(ARCH_CXX_FLAGS "-march=${GNUCC_ARCH} -mtune=${TUNE_FLAG}")
-         endif()
-    endif()
-    
-    if(ARCH_PPC64EL)
-        if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
-            set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
-        endif()
-        if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
-            set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
-        endif()
-    endif()
+#if(ARCH_PPC64EL)  # NOTE(review): disabled legacy code — PPC64EL is handled through ARCH_FLAG (mcpu) now; delete once confirmed
+#    if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_C_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#    if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*)
+#        set(ARCH_CXX_FLAGS "-mtune=${TUNE_FLAG}")
+#    endif()
+#endif()
 
-    if(CMAKE_COMPILER_IS_GNUCC)
-        # spurious warnings?
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+# Compiler version checks (only g++ is checked so far). TODO: test more compilers
+if (CMAKE_COMPILER_IS_GNUCXX)
+    set(GNUCXX_MINVER "9")  # oldest g++ accepted; the C++ flags below request -std=c++17
+    message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER)  # older g++ aborts the configure step
+        message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++17 support")
     endif()
+endif()
 
-    if(CMAKE_COMPILER_IS_GNUCXX)
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
-        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-            set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
-        endif ()
-        # don't complain about abi
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
-    endif()
-
-    if (NOT(ARCH_IA32 AND RELEASE_BUILD))
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
-    endif()
-
-
-    if (CMAKE_C_COMPILER_ID MATCHES "Intel")
-        set(SKYLAKE_FLAG "-xCORE-AVX512")
+if(RELEASE_BUILD)  # pick the project's own -O level (CMake's default -O flags were stripped earlier)
+    if (NOT CMAKE_BUILD_TYPE MATCHES MINSIZEREL)  # NOTE(review): MATCHES is case-sensitive — presumably CMAKE_BUILD_TYPE is upper-cased earlier; verify
+        set(OPT_C_FLAG "-O3")
+        set(OPT_CXX_FLAG "-O3")
     else ()
-        set(SKYLAKE_FLAG "-march=skylake-avx512")
-        set(ICELAKE_FLAG "-march=icelake-server")
+        set(OPT_C_FLAG "-Os")
+        set(OPT_CXX_FLAG "-Os")
     endif ()
+else()  # non-release builds are not optimised
+    set(OPT_C_FLAG "-O0")
+    set(OPT_CXX_FLAG "-O0")
+endif(RELEASE_BUILD)
+
+# Base compiler flags (optimisation level, language standard, warnings) - more are tested and added later
+set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
+set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
+if (CMAKE_COMPILER_IS_GNUCXX)  # -fno-new-ttp-matching is a g++ option: pass it to g++ only, not to every non-clang compiler
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
+endif()
+
+if (NOT RELEASE_BUILD)
+    # -Werror is most useful during development; it is left out of release
+    # builds so that a new compiler warning cannot break them
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
+endif()
+
+if (DISABLE_ASSERTS)  # -DNDEBUG compiles assert() out
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCC)
+    # silence two gcc warnings that are suspected to be spurious here
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)  # NOTE(review): cannot trigger once the g++ >= 9 check above has passed
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
+    endif ()
+    # don't complain about abi (added to the C flags as well as the C++ flags)
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
+endif()
+
+if (NOT(ARCH_IA32 AND RELEASE_BUILD))  # keep frame pointers everywhere except IA32 release builds
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
+endif()
 
 CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
 if (ARCH_IA32 OR ARCH_X86_64)
@@ -289,8 +332,6 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
       message(FATAL_ERROR "arm_sve.h is required to build for SVE.")
     endif()
   endif()
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions")
 elseif (ARCH_PPC64EL)
   CHECK_INCLUDE_FILE_CXX(altivec.h HAVE_C_PPC64EL_ALTIVEC_H)
 endif()
@@ -318,8 +359,7 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     # This is a Linux-only feature for now - requires platform support
     # elsewhere
     message(STATUS "generator is ${CMAKE_GENERATOR}")
-    if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND
-        CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")
+    if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS "3.9")  # CMAKE_C_COMPILER_ID is set by CMake itself, so the check needs no project-defined helper variable
         message (STATUS "Clang v3.9 or higher required for fat runtime, cannot build fat runtime")
         set (FAT_RUNTIME_REQUISITES FALSE)
     elseif (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
@@ -343,7 +383,10 @@ include (${CMAKE_MODULE_PATH}/arch.cmake)
 # testing a builtin takes a little more work
 CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED)
 CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED)
-CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+# Clang does not use __builtin_constant_p() the same way as gcc, so the probe is skipped there and this check never sets HAVE__BUILTIN_CONSTANT_P under clang
+if (NOT CMAKE_COMPILER_IS_CLANG)
+   CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
+endif()
 
 set(C_FLAGS_TO_CHECK
 # Variable length arrays are way bad, most especially at run time
@@ -442,19 +485,22 @@ if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
     set(FREEBSD true)
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
 
-if (NOT FAT_RUNTIME)
-    if (CROSS_COMPILE_AARCH64)
+# Report the build flavour (the fat runtime is rejected unless ARCH_IA32 or ARCH_X86_64 is set), then apply the arch flags
+if (FAT_RUNTIME)
+    if (NOT (ARCH_IA32 OR ARCH_X86_64))
+        message(FATAL_ERROR "Fat runtime is not supported on non-Intel architectures")
+    else()
+        message(STATUS "Building runtime for multiple microarchitectures")
+    endif()
+else()
+    if (CROSS_COMPILE)
         message(STATUS "Building for target CPU: ${ARCH_C_FLAGS}")
     else()
         message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}")
     endif()
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
-else()
-    message(STATUS "Building runtime for multiple microarchitectures")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}")  # now appended for fat and non-fat builds alike
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}")
 
 add_subdirectory(util)
 add_subdirectory(doc/dev-reference)
@@ -1171,10 +1217,6 @@ if (NOT FAT_RUNTIME)
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
 
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
-	if (ARCH_IA32)
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
-        endif (ARCH_IA32)
-
         add_library(hs STATIC
             src/hs_version.c
             src/hs_valid_platform.c
@@ -1205,14 +1247,14 @@ else (FAT_RUNTIME)
         add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
         set_target_properties(hs_exec_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"  # NOTE(review): -msse4.2 goes beyond what -march=core2 itself enables — confirm this variant is never dispatched on CPUs without SSE4.2
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
 
         add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
         set_target_properties(hs_exec_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"  # replaces the former -mssse3; NOTE(review): presumably redundant with -march=corei7 — confirm
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
 
@@ -1254,10 +1296,6 @@ else (FAT_RUNTIME)
             ${RUNTIME_LIBS})
         set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C)
         add_library(hs_compile OBJECT ${hs_compile_SRCS})
-        if (ARCH_IA32 OR ARCH_X86_64)
-            set_target_properties(hs_exec_common PROPERTIES COMPILE_FLAGS "-mssse3")
-            set_target_properties(hs_compile PROPERTIES COMPILE_FLAGS "-mssse3")
-        endif ()
 
         # we want the static lib for testing
         add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c
@@ -1274,14 +1312,14 @@ else (FAT_RUNTIME)
         add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
         set_target_properties(hs_exec_shared_core2 PROPERTIES
-            COMPILE_FLAGS "-march=core2"
+            COMPILE_FLAGS "-march=core2 -msse4.2"  # NOTE(review): -msse4.2 goes beyond what -march=core2 itself enables — confirm this variant is never dispatched on CPUs without SSE4.2
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
         add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
         list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
         set_target_properties(hs_exec_shared_corei7 PROPERTIES
-            COMPILE_FLAGS "-march=corei7 -mssse3"
+            COMPILE_FLAGS "-march=corei7 -msse4.2"  # replaces the former -mssse3; NOTE(review): presumably redundant with -march=corei7 — confirm
             POSITION_INDEPENDENT_CODE TRUE
             RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in"
             )
diff --git a/Jenkinsfile b/Jenkinsfile
index 1883f43a..3dbef5b6 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,22 +1,590 @@
 pipeline {
-  agent {
-    node {
-      label 'x86'
-    }
-
-  }
-  stages {
-    stage('Release, SSE') {
-      agent {
-        node {
-          label 'x86'
+    agent none
+    stages {
+        stage("Build") {
+            failFast true
+            parallel {
+                stage("Release/SSE") { // Release build on an "x86" agent with AVX2, AVX512 and the fat runtime all switched off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/AVX2") { // Release build on an "x86" agent with AVX2 on, AVX512 and the fat runtime off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/AVX512") { // Release build on an "x86" agent with AVX2 and AVX512 on, fat runtime off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/FAT") { // Release build on an "x86" agent with AVX2, AVX512 and the fat runtime all switched on
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") { // NOTE(review): unlike the non-fat stages, unit-internal is not run here — confirm this is intentional
+                            steps {
+                                sh 'build-release-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/SSE") { // Debug build on an "x86" agent with AVX2, AVX512 and the fat runtime all switched off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/AVX2") { // Debug build on an "x86" agent with AVX2 on, AVX512 and the fat runtime off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/AVX512") { // Debug build on an "x86" agent with AVX2 and AVX512 on, fat runtime off
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/FAT") { // Debug build on an "x86" agent with AVX2, AVX512 and the fat runtime all switched on
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") { // NOTE(review): unlike the non-fat stages, unit-internal is not run here — confirm this is intentional
+                            steps {
+                                sh 'build-debug-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/ARM") { // Release build on an "arm" agent; no extra CMake arguments are passed
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/ARM") { // Debug build on an "arm" agent; no extra CMake arguments are passed
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") { // fetches the pull-request refs and checks out the revision named by ${sha1}
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Release/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-release-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-release-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Debug/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-debug-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-debug-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-SSE', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    }
+                }
+                stage("Clang-Release/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-AVX2', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-AVX512', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-fat', buildType: 'Release', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/SSE") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-SSE', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=no -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-SSE/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-SSE/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/AVX2") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-AVX2', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=no -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX2/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX2/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/AVX512") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-AVX512', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=no', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX512/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-AVX512/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/FAT") {
+                    agent { label "x86" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-fat', buildType: 'Debug', cleanBuild: true, cmakeArgs: '-DBUILD_AVX2=yes -DBUILD_AVX512=yes -DFAT_RUNTIME=yes', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-fat/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-arm', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/ARM") {
+                    agent { label "arm" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") { // buildDir must be the clang-specific directory that the Unit Test/Test stages of this variant execute from
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-arm', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-arm/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-arm/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Release/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-release-power', buildType: 'Release', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-release-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-release-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+                stage("Clang-Debug/Power") {
+                    agent { label "power" }
+                    stages {
+                        stage("Git checkout") {
+                            steps {
+                                checkout([$class: 'GitSCM', branches: [[name: '${sha1}']], extensions: [], userRemoteConfigs: [[refspec: '+refs/pull/${ghprbPullId}/*:refs/remotes/origin/pr/${ghprbPullId}/*', url: 'https://github.com/VectorCamp/vectorscan.git']]])
+                            }
+                        } 
+                        stage("Build") {
+                            steps {
+                                cmakeBuild buildDir: 'build-clang-debug-power', buildType: 'Debug', cleanBuild: true, cmakeArgs: '', installation: 'InSearchPath', steps: [[envVars: 'CC=clang CXX=clang++', args: '--parallel 4', withCmake: true]]
+                            }
+                        }
+                        stage("Unit Test") {
+                            steps {
+                                sh 'build-clang-debug-power/bin/unit-internal'
+                            }
+                        }
+                        stage("Test") {
+                            steps {
+                                sh 'build-clang-debug-power/bin/unit-hyperscan'
+                            }
+                        }
+                    } 
+                }
+            }
         }
-
-      }
-      steps {
-        sh 'mkdir build-release-SSE &&  cmake -DCMAKE_BUILD_TYPE=Release   -C build-release-SSE'
-      }
     }
-
-  }
-}
\ No newline at end of file
+}
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 2100799f..29c39b49 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -88,7 +88,7 @@ if (FAT_RUNTIME)
             set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}")
         endif (BUILD_AVX512VBMI)
     elseif (BUILD_AVX2)
-        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx")
+        set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2 -mavx2")
     elseif ()
         set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-i7 -mssse3")
     endif ()
@@ -98,12 +98,12 @@ else (NOT FAT_RUNTIME)
 endif ()
 
 if (ARCH_IA32 OR ARCH_X86_64)
-    # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
+    # ensure we have the minimum of SSE4.2 - call a SSE4.2 intrinsic (_mm_cmpgt_epi64 / PCMPGTQ)
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
 int main() {
     __m128i a = _mm_set1_epi8(1);
-    (void)_mm_shuffle_epi8(a, a);
-}" HAVE_SSSE3)
+    (void)_mm_cmpgt_epi64(a, a);
+}" HAVE_SSE42)
 
     # now look for AVX2
     CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@@ -157,8 +157,8 @@ else ()
 endif ()
 
 if (FAT_RUNTIME)
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "SSE4.2 support required to build fat runtime")
     endif ()
     if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX2 AND NOT HAVE_AVX2)
         message(FATAL_ERROR "AVX2 support required to build fat runtime")
@@ -179,8 +179,8 @@ else (NOT FAT_RUNTIME)
     if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI)
         message(STATUS "Building without AVX512VBMI support")
     endif ()
-    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
-        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSE42)
+        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
     endif ()
     if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON)
         message(FATAL_ERROR "NEON support required for ARM support")
diff --git a/cmake/platform.cmake b/cmake/platform.cmake
index 2cdc3a6e..5a2b85b2 100644
--- a/cmake/platform.cmake
+++ b/cmake/platform.cmake
@@ -1,3 +1,8 @@
+# determine compiler: set CMAKE_COMPILER_IS_CLANG when the C++ compiler ID contains "Clang" (MATCHES is a regex test, so AppleClang qualifies too)
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(CMAKE_COMPILER_IS_CLANG TRUE)
+endif()
+
 # determine the target arch
 
 if (CROSS_COMPILE_AARCH64)
@@ -10,7 +15,7 @@ else()
   CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64)
   CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32)
-  CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !defined(__LITTLE_ENDIAN__) && !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL)
+  CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) || !defined(__LITTLE_ENDIAN__) || !defined(__VSX__)\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) # ppc64el needs all three: 64-bit PowerPC, little-endian, and VSX
   if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL)
     set(ARCH_64_BIT TRUE)
   else()
diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h
index 4c68b485..902d3624 100644
--- a/src/util/arch/arm/simd_utils.h
+++ b/src/util/arch/arm/simd_utils.h
@@ -122,24 +122,252 @@ m128 sub_2x64(m128 a, m128 b) {
     return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b);
 }
 
-static really_really_inline
+static really_inline
 m128 lshift_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT_m128(a, offset)  case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_LSHIFT_m128(a,  1);
+    CASE_LSHIFT_m128(a,  2);
+    CASE_LSHIFT_m128(a,  3);
+    CASE_LSHIFT_m128(a,  4);
+    CASE_LSHIFT_m128(a,  5);
+    CASE_LSHIFT_m128(a,  6);
+    CASE_LSHIFT_m128(a,  7);
+    CASE_LSHIFT_m128(a,  8);
+    CASE_LSHIFT_m128(a,  9);
+    CASE_LSHIFT_m128(a, 10);
+    CASE_LSHIFT_m128(a, 11);
+    CASE_LSHIFT_m128(a, 12);
+    CASE_LSHIFT_m128(a, 13);
+    CASE_LSHIFT_m128(a, 14);
+    CASE_LSHIFT_m128(a, 15);
+    CASE_LSHIFT_m128(a, 16);
+    CASE_LSHIFT_m128(a, 17);
+    CASE_LSHIFT_m128(a, 18);
+    CASE_LSHIFT_m128(a, 19);
+    CASE_LSHIFT_m128(a, 20);
+    CASE_LSHIFT_m128(a, 21);
+    CASE_LSHIFT_m128(a, 22);
+    CASE_LSHIFT_m128(a, 23);
+    CASE_LSHIFT_m128(a, 24);
+    CASE_LSHIFT_m128(a, 25);
+    CASE_LSHIFT_m128(a, 26);
+    CASE_LSHIFT_m128(a, 27);
+    CASE_LSHIFT_m128(a, 28);
+    CASE_LSHIFT_m128(a, 29);
+    CASE_LSHIFT_m128(a, 30);
+    CASE_LSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT_m128
 }
 
 static really_really_inline
 m128 rshift_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u32((uint32x4_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT_m128(a, offset)  case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_RSHIFT_m128(a,  1);
+    CASE_RSHIFT_m128(a,  2);
+    CASE_RSHIFT_m128(a,  3);
+    CASE_RSHIFT_m128(a,  4);
+    CASE_RSHIFT_m128(a,  5);
+    CASE_RSHIFT_m128(a,  6);
+    CASE_RSHIFT_m128(a,  7);
+    CASE_RSHIFT_m128(a,  8);
+    CASE_RSHIFT_m128(a,  9);
+    CASE_RSHIFT_m128(a, 10);
+    CASE_RSHIFT_m128(a, 11);
+    CASE_RSHIFT_m128(a, 12);
+    CASE_RSHIFT_m128(a, 13);
+    CASE_RSHIFT_m128(a, 14);
+    CASE_RSHIFT_m128(a, 15);
+    CASE_RSHIFT_m128(a, 16);
+    CASE_RSHIFT_m128(a, 17);
+    CASE_RSHIFT_m128(a, 18);
+    CASE_RSHIFT_m128(a, 19);
+    CASE_RSHIFT_m128(a, 20);
+    CASE_RSHIFT_m128(a, 21);
+    CASE_RSHIFT_m128(a, 22);
+    CASE_RSHIFT_m128(a, 23);
+    CASE_RSHIFT_m128(a, 24);
+    CASE_RSHIFT_m128(a, 25);
+    CASE_RSHIFT_m128(a, 26);
+    CASE_RSHIFT_m128(a, 27);
+    CASE_RSHIFT_m128(a, 28);
+    CASE_RSHIFT_m128(a, 29);
+    CASE_RSHIFT_m128(a, 30);
+    CASE_RSHIFT_m128(a, 31);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT_m128
 }
 
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
-    return (m128) vshlq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshlq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_LSHIFT64_m128(a, offset)  case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_LSHIFT64_m128(a,  1);
+    CASE_LSHIFT64_m128(a,  2);
+    CASE_LSHIFT64_m128(a,  3);
+    CASE_LSHIFT64_m128(a,  4);
+    CASE_LSHIFT64_m128(a,  5);
+    CASE_LSHIFT64_m128(a,  6);
+    CASE_LSHIFT64_m128(a,  7);
+    CASE_LSHIFT64_m128(a,  8);
+    CASE_LSHIFT64_m128(a,  9);
+    CASE_LSHIFT64_m128(a, 10);
+    CASE_LSHIFT64_m128(a, 11);
+    CASE_LSHIFT64_m128(a, 12);
+    CASE_LSHIFT64_m128(a, 13);
+    CASE_LSHIFT64_m128(a, 14);
+    CASE_LSHIFT64_m128(a, 15);
+    CASE_LSHIFT64_m128(a, 16);
+    CASE_LSHIFT64_m128(a, 17);
+    CASE_LSHIFT64_m128(a, 18);
+    CASE_LSHIFT64_m128(a, 19);
+    CASE_LSHIFT64_m128(a, 20);
+    CASE_LSHIFT64_m128(a, 21);
+    CASE_LSHIFT64_m128(a, 22);
+    CASE_LSHIFT64_m128(a, 23);
+    CASE_LSHIFT64_m128(a, 24);
+    CASE_LSHIFT64_m128(a, 25);
+    CASE_LSHIFT64_m128(a, 26);
+    CASE_LSHIFT64_m128(a, 27);
+    CASE_LSHIFT64_m128(a, 28);
+    CASE_LSHIFT64_m128(a, 29);
+    CASE_LSHIFT64_m128(a, 30);
+    CASE_LSHIFT64_m128(a, 31);
+    CASE_LSHIFT64_m128(a, 32);
+    CASE_LSHIFT64_m128(a, 33);
+    CASE_LSHIFT64_m128(a, 34);
+    CASE_LSHIFT64_m128(a, 35);
+    CASE_LSHIFT64_m128(a, 36);
+    CASE_LSHIFT64_m128(a, 37);
+    CASE_LSHIFT64_m128(a, 38);
+    CASE_LSHIFT64_m128(a, 39);
+    CASE_LSHIFT64_m128(a, 40);
+    CASE_LSHIFT64_m128(a, 41);
+    CASE_LSHIFT64_m128(a, 42);
+    CASE_LSHIFT64_m128(a, 43);
+    CASE_LSHIFT64_m128(a, 44);
+    CASE_LSHIFT64_m128(a, 45);
+    CASE_LSHIFT64_m128(a, 46);
+    CASE_LSHIFT64_m128(a, 47);
+    CASE_LSHIFT64_m128(a, 48);
+    CASE_LSHIFT64_m128(a, 49);
+    CASE_LSHIFT64_m128(a, 50);
+    CASE_LSHIFT64_m128(a, 51);
+    CASE_LSHIFT64_m128(a, 52);
+    CASE_LSHIFT64_m128(a, 53);
+    CASE_LSHIFT64_m128(a, 54);
+    CASE_LSHIFT64_m128(a, 55);
+    CASE_LSHIFT64_m128(a, 56);
+    CASE_LSHIFT64_m128(a, 57);
+    CASE_LSHIFT64_m128(a, 58);
+    CASE_LSHIFT64_m128(a, 59);
+    CASE_LSHIFT64_m128(a, 60);
+    CASE_LSHIFT64_m128(a, 61);
+    CASE_LSHIFT64_m128(a, 62);
+    CASE_LSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_LSHIFT64_m128
 }
 
 static really_really_inline
 m128 rshift64_m128(m128 a, unsigned b) {
-    return (m128) vshrq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(b)) {
+        return (m128) vshrq_n_u64((uint64x2_t)a, b);
+    }
+#endif
+#define CASE_RSHIFT64_m128(a, offset)  case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
+    switch (b) {
+    case 0:  return a; break;
+    CASE_RSHIFT64_m128(a,  1);
+    CASE_RSHIFT64_m128(a,  2);
+    CASE_RSHIFT64_m128(a,  3);
+    CASE_RSHIFT64_m128(a,  4);
+    CASE_RSHIFT64_m128(a,  5);
+    CASE_RSHIFT64_m128(a,  6);
+    CASE_RSHIFT64_m128(a,  7);
+    CASE_RSHIFT64_m128(a,  8);
+    CASE_RSHIFT64_m128(a,  9);
+    CASE_RSHIFT64_m128(a, 10);
+    CASE_RSHIFT64_m128(a, 11);
+    CASE_RSHIFT64_m128(a, 12);
+    CASE_RSHIFT64_m128(a, 13);
+    CASE_RSHIFT64_m128(a, 14);
+    CASE_RSHIFT64_m128(a, 15);
+    CASE_RSHIFT64_m128(a, 16);
+    CASE_RSHIFT64_m128(a, 17);
+    CASE_RSHIFT64_m128(a, 18);
+    CASE_RSHIFT64_m128(a, 19);
+    CASE_RSHIFT64_m128(a, 20);
+    CASE_RSHIFT64_m128(a, 21);
+    CASE_RSHIFT64_m128(a, 22);
+    CASE_RSHIFT64_m128(a, 23);
+    CASE_RSHIFT64_m128(a, 24);
+    CASE_RSHIFT64_m128(a, 25);
+    CASE_RSHIFT64_m128(a, 26);
+    CASE_RSHIFT64_m128(a, 27);
+    CASE_RSHIFT64_m128(a, 28);
+    CASE_RSHIFT64_m128(a, 29);
+    CASE_RSHIFT64_m128(a, 30);
+    CASE_RSHIFT64_m128(a, 31);
+    CASE_RSHIFT64_m128(a, 32);
+    CASE_RSHIFT64_m128(a, 33);
+    CASE_RSHIFT64_m128(a, 34);
+    CASE_RSHIFT64_m128(a, 35);
+    CASE_RSHIFT64_m128(a, 36);
+    CASE_RSHIFT64_m128(a, 37);
+    CASE_RSHIFT64_m128(a, 38);
+    CASE_RSHIFT64_m128(a, 39);
+    CASE_RSHIFT64_m128(a, 40);
+    CASE_RSHIFT64_m128(a, 41);
+    CASE_RSHIFT64_m128(a, 42);
+    CASE_RSHIFT64_m128(a, 43);
+    CASE_RSHIFT64_m128(a, 44);
+    CASE_RSHIFT64_m128(a, 45);
+    CASE_RSHIFT64_m128(a, 46);
+    CASE_RSHIFT64_m128(a, 47);
+    CASE_RSHIFT64_m128(a, 48);
+    CASE_RSHIFT64_m128(a, 49);
+    CASE_RSHIFT64_m128(a, 50);
+    CASE_RSHIFT64_m128(a, 51);
+    CASE_RSHIFT64_m128(a, 52);
+    CASE_RSHIFT64_m128(a, 53);
+    CASE_RSHIFT64_m128(a, 54);
+    CASE_RSHIFT64_m128(a, 55);
+    CASE_RSHIFT64_m128(a, 56);
+    CASE_RSHIFT64_m128(a, 57);
+    CASE_RSHIFT64_m128(a, 58);
+    CASE_RSHIFT64_m128(a, 59);
+    CASE_RSHIFT64_m128(a, 60);
+    CASE_RSHIFT64_m128(a, 61);
+    CASE_RSHIFT64_m128(a, 62);
+    CASE_RSHIFT64_m128(a, 63);
+    default: return zeroes128(); break;
+    }
+#undef CASE_RSHIFT64_m128
 }
 
 static really_inline m128 eq128(m128 a, m128 b) {
@@ -191,9 +419,11 @@ m128 load_m128_from_u64a(const u64a *p) {
 }
 
 static really_inline u32 extract32from128(const m128 in, unsigned imm) {
-#if defined(HS_OPTIMIZE)
-    return vgetq_lane_u32((uint32x4_t) in, imm);
-#else
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(imm)) {
+        return vgetq_lane_u32((uint32x4_t) in, imm);
+    }
+#endif
     switch (imm) {
     case 0:
         return vgetq_lane_u32((uint32x4_t) in, 0);
@@ -211,13 +441,14 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) {
 	return 0;
 	break;
     }
-#endif
 }
 
 static really_inline u64a extract64from128(const m128 in, unsigned imm) {
-#if defined(HS_OPTIMIZE)
-    return vgetq_lane_u64((uint64x2_t) in, imm);
-#else
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(imm)) {
+        return vgetq_lane_u64((uint64x2_t) in, imm);
+    }
+#endif
     switch (imm) {
     case 0:
         return vgetq_lane_u64((uint64x2_t) in, 0);
@@ -229,7 +460,6 @@ static really_inline u64a extract64from128(const m128 in, unsigned imm) {
 	return 0;
 	break;
     }
-#endif
 }
 
 static really_inline m128 low64from128(const m128 in) {
diff --git a/src/util/arch/ppc64el/simd_types.h b/src/util/arch/ppc64el/simd_types.h
index 21dae5cb..8a5b0e25 100644
--- a/src/util/arch/ppc64el/simd_types.h
+++ b/src/util/arch/ppc64el/simd_types.h
@@ -30,7 +30,7 @@
 #define ARCH_PPC64EL_SIMD_TYPES_H
 
 #if !defined(m128) && defined(HAVE_VSX)
-typedef __vector int32_t m128;
+typedef __vector int m128;
 #endif
 
 #endif /* ARCH_PPC64EL_SIMD_TYPES_H  */
diff --git a/src/util/arch/ppc64el/simd_utils.h b/src/util/arch/ppc64el/simd_utils.h
index 137fc94f..d046ed47 100644
--- a/src/util/arch/ppc64el/simd_utils.h
+++ b/src/util/arch/ppc64el/simd_utils.h
@@ -43,6 +43,18 @@
 
 #include <string.h> // for memcpy
 
+typedef __vector unsigned long long int  uint64x2_t;
+typedef __vector   signed long long int   int64x2_t;
+typedef __vector unsigned int            uint32x4_t;
+typedef __vector   signed int             int32x4_t;
+typedef __vector unsigned short int      uint16x8_t;
+typedef __vector   signed short int       int16x8_t;
+typedef __vector unsigned char           uint8x16_t;
+typedef __vector  signed char             int8x16_t;
+
+typedef unsigned long long int ulong64_t;
+typedef   signed long long int  long64_t;
+/*
 typedef __vector  uint64_t uint64x2_t;
 typedef __vector   int64_t  int64x2_t;
 typedef __vector  uint32_t uint32x4_t;
@@ -50,7 +62,7 @@ typedef __vector   int32_t  int32x4_t;
 typedef __vector  uint16_t uint16x8_t;
 typedef __vector   int16_t  int16x8_t;
 typedef __vector   uint8_t uint8x16_t;
-typedef __vector    int8_t  int8x16_t;
+typedef __vector    int8_t  int8x16_t;*/
 
 
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
@@ -182,13 +194,13 @@ m128 rshift_m128(m128 a, unsigned b) {
 
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
-  uint64x2_t shift_indices = vec_splats((uint64_t)b); 
+  uint64x2_t shift_indices = vec_splats((ulong64_t)b); 
   return (m128) vec_sl((int64x2_t)a, shift_indices);
 }
 
 static really_really_inline
 m128 rshift64_m128(m128 a, unsigned  b) {
-  uint64x2_t shift_indices = vec_splats((uint64_t)b); 
+  uint64x2_t shift_indices = vec_splats((ulong64_t)b); 
   return (m128) vec_sr((int64x2_t)a, shift_indices);
 }
 
@@ -213,11 +225,11 @@ static really_inline u32 movemask128(m128 a) {
    uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
    
    uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-   uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff));
+   uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
    uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
   
    uint64x2_t ss4 = vec_sld((uint64x2_t)vec_splats(0), s4, 9);
-   uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff));
+   uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
    uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
  
    return s5[0];
diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h
index c04e8dab..e1642404 100644
--- a/src/util/arch/x86/simd_types.h
+++ b/src/util/arch/x86/simd_types.h
@@ -30,7 +30,7 @@
 #ifndef SIMD_TYPES_X86_H
 #define SIMD_TYPES_X86_H
 
-#if !defined(m128) && defined(HAVE_SSE2)
+#if !defined(m128) && defined(HAVE_SSE42)
 typedef __m128i m128;
 #endif
 
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 0deff7e5..4f0fd1a9 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -51,6 +51,7 @@ typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
 #endif
 
 typedef struct {m128 lo; m128 mid; m128 hi;} m384;
+
 #if !defined(m512) && !defined(HAVE_SIMD_512_BITS)
 typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
 #endif
diff --git a/src/util/supervector/arch/arm/impl.cpp b/src/util/supervector/arch/arm/impl.cpp
index 980f0b39..ff1149a9 100644
--- a/src/util/supervector/arch/arm/impl.cpp
+++ b/src/util/supervector/arch/arm/impl.cpp
@@ -45,112 +45,112 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t other)
+really_inline SuperVector<16>::SuperVector(int8x16_t other)
 {
     u.s8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t other)
+really_inline SuperVector<16>::SuperVector(uint8x16_t other)
 {
     u.u8x16[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16x8_t>(int16x8_t other)
+really_inline SuperVector<16>::SuperVector(int16x8_t other)
 {
     u.s16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16x8_t>(uint16x8_t other)
+really_inline SuperVector<16>::SuperVector(uint16x8_t other)
 {
     u.u16x8[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32x4_t>(int32x4_t other)
+really_inline SuperVector<16>::SuperVector(int32x4_t other)
 {
     u.s32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32x4_t>(uint32x4_t other)
+really_inline SuperVector<16>::SuperVector(uint32x4_t other)
 {
     u.u32x4[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64x2_t>(int64x2_t other)
+really_inline SuperVector<16>::SuperVector(int64x2_t other)
 {
     u.s64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64x2_t>(uint64x2_t other)
+really_inline SuperVector<16>::SuperVector(uint64x2_t other)
 {
     u.u64x2[0] = other;
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.s8x16[0] = vdupq_n_s8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.u8x16[0] = vdupq_n_u8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.s16x8[0] = vdupq_n_s16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.u16x8[0] = vdupq_n_u16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.s32x4[0] = vdupq_n_s32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.u32x4[0] = vdupq_n_u32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
     u.s64x2[0] = vdupq_n_s64(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
     u.u64x2[0] = vdupq_n_u64(other);
 }
@@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 }
 
@@ -386,7 +386,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
 
@@ -394,9 +394,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -404,9 +404,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -416,7 +416,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
     return result;
 }
 
@@ -430,9 +430,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
     return result;
 }
 
@@ -442,7 +442,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
     return result;
 }
 
@@ -450,9 +450,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
     return result;
 }
 
@@ -460,9 +460,9 @@ template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 {
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N == 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
     return result;
 }
 
@@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
     return result;
 }
 
diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp
index e054e02e..109b8d5e 100644
--- a/src/util/supervector/arch/ppc64el/impl.cpp
+++ b/src/util/supervector/arch/ppc64el/impl.cpp
@@ -39,16 +39,6 @@
 #include "util/supervector/supervector.hpp"
 #include <iostream>
 
-
-typedef __vector uint64_t uint64x2_t;
-typedef __vector  int64_t  int64x2_t;
-typedef __vector uint32_t uint32x4_t;
-typedef __vector  int32_t  int32x4_t;
-typedef __vector uint16_t uint16x8_t;
-typedef __vector  int16_t  int16x8_t;
-typedef __vector  uint8_t uint8x16_t;
-typedef __vector   int8_t  int8x16_t;
-
 // 128-bit Powerpc64le implementation
 
 template<>
@@ -65,58 +55,58 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.v128[0] = (m128) vec_splats(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.v128[0] = (m128) vec_splats(static_cast<uint32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(other);
+    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
-    u.v128[0] = (m128) vec_splats(static_cast<uint64_t>(other));
+    u.v128[0] = (m128) vec_splats(static_cast<ulong64_t>(other));
 }
 
 // Constants
@@ -229,11 +219,11 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::movemask(
     uint32x4_t s3 = vec_or((uint32x4_t)ss2, res_and2);
 
     uint64x2_t ss3 = vec_sr((uint64x2_t)s3, (uint64x2_t)vec_splats(28));
-    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((uint64_t)0xff));
+    uint64x2_t res_and3 = vec_and((uint64x2_t)s3, vec_splats((ulong64_t)0xff));
     uint64x2_t s4 = vec_or((uint64x2_t)ss3, res_and3);
 
     uint64x2_t ss4 = vec_sld((uint64x2_t) vec_splats(0), s4, 9);
-    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((uint64_t)0xff));
+    uint64x2_t res_and4 = vec_and((uint64x2_t)s4, vec_splats((ulong64_t)0xff));
     uint64x2_t s5 = vec_or((uint64x2_t)ss4, res_and4);
     
     return s5[0];
@@ -271,7 +261,7 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
 {
-    return { (m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)N)) };
+    return { (m128) vec_sl(u.s64x2[0], vec_splats((ulong64_t)N)) };
 }
 
 template <>
@@ -313,7 +303,7 @@ template <>
 template<uint8_t N>
 really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
 {		 
-   return { (m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)N)) }; 
+   return { (m128) vec_sr(u.s64x2[0], vec_splats((ulong64_t)N)) }; 
 }
 
 template <>
@@ -352,7 +342,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_8  (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s8x16[0], vec_splats((uint8_t)n))}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
     return result;
 }
 }
 
@@ -362,7 +352,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const UNUSED N)
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result; 
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s16x8[0], vec_splats((uint16_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
     return result;
 }
 
@@ -372,7 +362,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s32x4[0], vec_splats((uint32_t)n))}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
     return result;
 }
 }
 
@@ -382,7 +372,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(u.s64x2[0], vec_splats((uint64_t)n))}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sl(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
     return result;
 }
 }
 
@@ -392,7 +382,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld(v->u.s8x16[0], (int8x16_t)vec_splat_s8(0), n)}; });
     return result;
 }
 
@@ -408,7 +398,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_8  (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 8) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s8x16[0], vec_splats((uint8_t)n))}; });
+    Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s8x16[0], vec_splats((uint8_t)n))}; });
     return result;
 }
 }
 
@@ -418,7 +408,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s16x8[0], vec_splats((uint16_t)n))}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s16x8[0], vec_splats((uint16_t)n))}; });
     return result;
 }
 
@@ -428,7 +418,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 32) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s32x4[0], vec_splats((uint32_t)n))}; });
+    Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s32x4[0], vec_splats((uint32_t)n))}; });
     return result;
 }
 }
 
@@ -438,7 +428,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
     if (N == 0) return *this;
-    if (N == 16) return Zeroes();
+    if (N >= 64) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(u.s64x2[0], vec_splats((uint64_t)n))}; });
+    Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sr(v->u.s64x2[0], vec_splats((ulong64_t)n))}; });
     return result;
 }
 }
 
@@ -448,7 +438,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const UNUSED N)
     if (N == 0) return *this;
     if (N == 16) return Zeroes();
     SuperVector result;
-    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), u.s8x16[0], 16 - n)}; });
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {(m128) vec_sld((int8x16_t)vec_splat_u8(0), v->u.s8x16[0], 16 - n)}; });
     return result;
 }
 
@@ -523,14 +513,14 @@ really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
 template <>
 really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
 {
-    return (m128) vec_xl(0, (const int64_t*)ptr);
+    return (m128) vec_xl(0, (const long64_t*)ptr);
 }
 
 template <>
 really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
 {
     assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
-    return (m128)  vec_xl(0, (const int64_t*)ptr);
+    return (m128)  vec_xl(0, (const long64_t*)ptr);
 }
 
 template <>
diff --git a/src/util/supervector/arch/ppc64el/types.hpp b/src/util/supervector/arch/ppc64el/types.hpp
index dbd863f4..bdc6608e 100644
--- a/src/util/supervector/arch/ppc64el/types.hpp
+++ b/src/util/supervector/arch/ppc64el/types.hpp
@@ -27,6 +27,18 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+typedef __vector unsigned long long int  uint64x2_t;
+typedef __vector   signed long long int   int64x2_t;
+typedef __vector unsigned int            uint32x4_t;
+typedef __vector   signed int             int32x4_t;
+typedef __vector unsigned short int      uint16x8_t;
+typedef __vector   signed short int       int16x8_t;
+typedef __vector unsigned char           uint8x16_t;
+typedef __vector  signed char             int8x16_t;
+
+typedef unsigned long long int ulong64_t;
+typedef   signed long long int  long64_t;
+
 #if !defined(m128) && defined(HAVE_VSX)
-typedef __vector int32_t m128;
+typedef __vector int m128;
 #endif
diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index b7686220..157f1dc4 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -55,56 +55,56 @@ really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
 {
     u.v128[0] = _mm_set1_epi8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
 {
     u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
 {
     u.v128[0] = _mm_set1_epi16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
 {
     u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
 {
     u.v128[0] = _mm_set1_epi32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
 {
     u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
 {
     u.v128[0] = _mm_set1_epi64x(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
 {
     u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
 }
@@ -608,56 +608,56 @@ really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<32>::SuperVector(int8_t const other)
 {
     u.v256[0] = _mm256_set1_epi8(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<32>::SuperVector(uint8_t const other)
 {
     u.v256[0] = _mm256_set1_epi8(static_cast<int8_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<32>::SuperVector(int16_t const other)
 {
     u.v256[0] = _mm256_set1_epi16(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<32>::SuperVector(uint16_t const other)
 {
     u.v256[0] = _mm256_set1_epi16(static_cast<int16_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<32>::SuperVector(int32_t const other)
 {
     u.v256[0] = _mm256_set1_epi32(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<32>::SuperVector(uint32_t const other)
 {
     u.v256[0] = _mm256_set1_epi32(static_cast<int32_t>(other));
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<32>::SuperVector(int64_t const other)
 {
     u.v256[0] = _mm256_set1_epi64x(other);
 }
 
 template<>
 template<>
-really_inline SuperVector<32>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<32>::SuperVector(uint64_t const other)
 {
     u.v256[0] = _mm256_set1_epi64x(static_cast<int64_t>(other));
 }
@@ -804,7 +804,7 @@ really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const
 
 template <>
 template<uint8_t N>
-really_inline SuperVector<16> SuperVector<32>::vshl_256_imm() const
+really_inline SuperVector<32> SuperVector<32>::vshl_256_imm() const
 {
     if (N == 0) return *this;
     if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
@@ -950,11 +950,11 @@ really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const
     SuperVector result;
     Unroller<1, 16>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};;
+        if (N == n) result = {_mm256_alignr_epi8(v->u.v256[0], _mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};
     });
     Unroller<17, 32>::iterator([&,v=this](auto const i) {
         constexpr uint8_t n = i.value;
-        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
+        if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)};
     });
     return result;
 }
@@ -1240,56 +1240,56 @@ really_inline SuperVector<64>::SuperVector(m128 const v)
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int8_t>(int8_t const o)
+really_inline SuperVector<64>::SuperVector(int8_t const o)
 {
     u.v512[0] = _mm512_set1_epi8(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint8_t>(uint8_t const o)
+really_inline SuperVector<64>::SuperVector(uint8_t const o)
 {
     u.v512[0] = _mm512_set1_epi8(static_cast<int8_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int16_t>(int16_t const o)
+really_inline SuperVector<64>::SuperVector(int16_t const o)
 {
     u.v512[0] = _mm512_set1_epi16(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint16_t>(uint16_t const o)
+really_inline SuperVector<64>::SuperVector(uint16_t const o)
 {
     u.v512[0] = _mm512_set1_epi16(static_cast<int16_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int32_t>(int32_t const o)
+really_inline SuperVector<64>::SuperVector(int32_t const o)
 {
     u.v512[0] = _mm512_set1_epi32(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint32_t>(uint32_t const o)
+really_inline SuperVector<64>::SuperVector(uint32_t const o)
 {
     u.v512[0] = _mm512_set1_epi32(static_cast<int32_t>(o));
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<int64_t>(int64_t const o)
+really_inline SuperVector<64>::SuperVector(int64_t const o)
 {
     u.v512[0] = _mm512_set1_epi64(o);
 }
 
 template<>
 template<>
-really_inline SuperVector<64>::SuperVector<uint64_t>(uint64_t const o)
+really_inline SuperVector<64>::SuperVector(uint64_t const o)
 {
     u.v512[0] = _mm512_set1_epi64(static_cast<int64_t>(o));
 }
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 737412f6..f0ddf63c 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -165,7 +165,7 @@ public:
     typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
     typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
 
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
     uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
     int64x2_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
     uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@@ -176,17 +176,6 @@ public:
     int8x16_t  ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];
 #endif
 
-#if defined(ARCH_PPC64EL)
-    __vector uint64_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
-    __vector int64_t  ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
-    __vector uint32_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
-    __vector int32_t  ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size];
-    __vector uint16_t ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size];
-    __vector int16_t  ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size];
-    __vector uint8_t  ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size];
-    __vector int8_t   ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size];	
-#endif
-
     uint64_t u64[SIZE / sizeof(uint64_t)];
     int64_t  s64[SIZE / sizeof(int64_t)];
     uint32_t u32[SIZE / sizeof(uint32_t)];
@@ -200,7 +189,7 @@ public:
   } u;
 
   constexpr SuperVector() {};
-  constexpr SuperVector(SuperVector const &other)
+  SuperVector(SuperVector const &other)
   :u(other.u) {};
   SuperVector(typename base_type::type const v);
 
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index 900078bb..bc2421dc 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) {
     simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     int64x2_t a = { 0x123456789abcdefLL, ~0LL };
-    simd = vreinterpretq_s64_s8(a);
+    simd = vreinterpretq_s32_s64(a);
 #elif defined(ARCH_PPC64EL)
     int64x2_t a = {0x123456789abcdefLL, ~0LL };
     simd = (m128) a;
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index 82cee0ff..ea942ef1 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -33,9 +33,6 @@ SET(corpusomatic_SRCS
     ng_find_matches.cpp
 )
 add_library(corpusomatic STATIC ${corpusomatic_SRCS})
-if (ARCH_IA32 OR ARCH_X86_64)
-    set_target_properties(corpusomatic PROPERTIES COMPILE_FLAGS "-mssse3")
-endif ()
 
 set(databaseutil_SRCS
     database_util.cpp