From 2d89df44ae9cd83a58ec949733a4a7bd1456a193 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Sep 2020 19:00:48 +0300 Subject: [PATCH 01/53] move x86 arch and SIMD types to x86 arch folder --- src/util/arch.h | 55 ++----------------- src/util/arch/x86/simd_types.h | 45 ++++++++++++++++ src/util/arch/x86/x86.h | 96 ++++++++++++++++++++++++++++++++++ src/util/simd_types.h | 16 +++--- src/util/simd_utils.h | 2 +- 5 files changed, 152 insertions(+), 62 deletions(-) create mode 100644 src/util/arch/x86/simd_types.h create mode 100644 src/util/arch/x86/x86.h diff --git a/src/util/arch.h b/src/util/arch.h index 985fec6a..57e39c07 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -33,58 +33,9 @@ #ifndef UTIL_ARCH_H_ #define UTIL_ARCH_H_ -#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) -#define HAVE_SSE2 +#if defined(__i386__) || defined(__x86_64__) +#include "util/arch/x86/x86.h" #endif -#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE41 -#endif +#endif // UTIL_ARCH_X86_H_ -#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) -#define HAVE_SSE42 -#endif - -#if defined(__AVX__) -#define HAVE_AVX -#endif - -#if defined(__AVX2__) -#define HAVE_AVX2 -#endif - -#if defined(__AVX512BW__) -#define HAVE_AVX512 -#endif - -#if defined(__AVX512VBMI__) -#define HAVE_AVX512VBMI -#endif - -/* - * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros - */ -#if defined(__POPCNT__) || \ - (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ - (defined(_WIN32) && defined(__AVX__)) -#define HAVE_POPCOUNT_INSTR -#endif - -#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI -#endif - -#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ - (defined(__INTEL_COMPILER) && defined(__AVX2__)) -#define HAVE_BMI2 -#endif - -/* - * MSVC uses a different form of inline asm - */ -#if defined(_WIN32) && defined(_MSC_VER) -#define NO_ASM -#endif - -#endif // UTIL_ARCH_H_ diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h new file mode 100644 index 00000000..a582abd5 --- /dev/null +++ b/src/util/arch/x86/simd_types.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_X86_H +#define SIMD_TYPES_X86_H + +#if !defined(m128) && defined(HAVE_SSE2) +typedef __m128i m128; +#endif + +#if !defined(m128) && defined(HAVE_AVX2) +typedef __m256i m256; +#endif + +#if !defined(m512) && defined(HAVE_AVX512) +typedef __m512i m512; +#endif + +#endif /* SIMD_TYPES_H */ + diff --git a/src/util/arch/x86/x86.h b/src/util/arch/x86/x86.h new file mode 100644 index 00000000..8126f14a --- /dev/null +++ b/src/util/arch/x86/x86.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_X86_H_ +#define UTIL_ARCH_X86_H_ + +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE41 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE42 +#define HAVE_SIMD_128_BITS +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX2__) +#define HAVE_AVX2 +#define HAVE_SIMD_256_BITS +#endif + +#if defined(__AVX512BW__) +#define HAVE_AVX512 +#define HAVE_SIMD_512_BITS +#endif + +#if defined(__AVX512VBMI__) +#define HAVE_AVX512VBMI +#endif + +/* + * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros + */ +#if defined(__POPCNT__) || \ + (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ + (defined(_WIN32) && defined(__AVX__)) +#define HAVE_POPCOUNT_INSTR +#endif + +#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI +#endif + +#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI2 +#endif + +/* + * MSVC uses a different form of inline asm + */ +#if defined(_WIN32) && defined(_MSC_VER) +#define NO_ASM +#endif + +#endif // UTIL_ARCH_X86_H_ diff --git a/src/util/simd_types.h b/src/util/simd_types.h index 962cad6c..a58ede4d 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -34,22 +34,20 @@ #include "util/intrinsics.h" #include "ue2common.h" -#if defined(HAVE_SSE2) -typedef __m128i m128; -#else +#if defined(__i386__) || defined(__x86_64__) +#include "util/arch/x86/simd_types.h" +#endif + +#if !defined(m128) && !defined(HAVE_SIMD_128_BITS) typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; #endif -#if defined(HAVE_AVX2) -typedef __m256i m256; -#else +#if !defined(m256) && !defined(HAVE_SIMD_256_BITS) typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif typedef struct {m128 lo; m128 mid; m128 hi;} m384; -#if defined(HAVE_AVX512) -typedef __m512i m512; -#else +#if !defined(m512) && !defined(HAVE_SIMD_512_BITS) typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; #endif diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 42223133..671a5bab 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -38,10 +38,10 @@ #endif #include "config.h" +#include "util/arch.h" #include "ue2common.h" #include "simd_types.h" #include "unaligned.h" -#include "util/arch.h" #include "util/intrinsics.h" #include // for memcpy From 6a407937197744252b1e90cc22245d8c9d8a80ae Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Sep 2020 20:35:39 +0300 Subject: [PATCH 02/53] move cpuid stuff to util/arch/x86 --- CMakeLists.txt | 4 ++-- src/dispatcher.c | 4 +++- src/hs.cpp | 6 ++++-- src/hs_valid_platform.c | 6 ++++-- src/util/{ => arch/x86}/cpuid_flags.c | 0 src/util/{ => arch/x86}/cpuid_flags.h | 0 src/util/{ => arch/x86}/cpuid_inline.h | 0 src/util/target_info.cpp | 4 +++- 8 files changed, 16 insertions(+), 8 deletions(-) rename src/util/{ => arch/x86}/cpuid_flags.c (100%) rename src/util/{ => arch/x86}/cpuid_flags.h (100%) rename src/util/{ => arch/x86}/cpuid_inline.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 59c6e6e2..9cd6ad96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -564,8 
+564,8 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c src/scratch.c - src/util/cpuid_flags.c - src/util/cpuid_flags.h + src/util/arch/x86/cpuid_flags.c + src/util/arch/x86/cpuid_flags.h src/util/multibit.c ) diff --git a/src/dispatcher.c b/src/dispatcher.c index a786b806..76ed37a1 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -30,7 +30,9 @@ #include "hs_common.h" #include "hs_runtime.h" #include "ue2common.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#endif #include "util/join.h" #if defined(DISABLE_AVX512_DISPATCH) diff --git a/src/hs.cpp b/src/hs.cpp index ab54105c..a0cb9bb3 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -44,8 +44,10 @@ #include "parser/prefilter.h" #include "parser/unsupported.h" #include "util/compile_error.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_flags.h" +#include "util/arch/x86/cpuid_inline.h" +#endif #include "util/depth.h" #include "util/popcount.h" #include "util/target_info.h" diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 59ad3f3a..7a022607 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "config.h" #include "hs_common.h" -#include "util/cpuid_flags.h" -#include "util/cpuid_inline.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_inline.h" +#endif HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { diff --git a/src/util/cpuid_flags.c b/src/util/arch/x86/cpuid_flags.c similarity index 100% rename from src/util/cpuid_flags.c rename to src/util/arch/x86/cpuid_flags.c diff --git a/src/util/cpuid_flags.h b/src/util/arch/x86/cpuid_flags.h similarity index 100% rename from src/util/cpuid_flags.h rename to src/util/arch/x86/cpuid_flags.h diff --git a/src/util/cpuid_inline.h b/src/util/arch/x86/cpuid_inline.h similarity index 100% rename from src/util/cpuid_inline.h rename to src/util/arch/x86/cpuid_inline.h diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 3a41e020..6eab701d 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -29,7 +29,9 @@ #include "hs_compile.h" // for various hs_platform_info flags #include "target_info.h" -#include "util/cpuid_flags.h" +#if defined(ARCH_X86_64) +#include "util/arch/x86/cpuid_flags.h" +#endif namespace ue2 { From ea721c908f9baa75c50320285f552ee995669191 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:48:14 +0300 Subject: [PATCH 03/53] move crc32 SSE42 implementation to util/arch/x86 --- src/crc32.c | 49 +---------------------- src/util/arch/x86/crc32.h | 82 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 48 deletions(-) create mode 100644 src/util/arch/x86/crc32.h diff --git a/src/crc32.c b/src/crc32.c index 1dae47b4..19c7b7fa 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -30,7 +30,6 @@ #include "config.h" #include "ue2common.h" #include "util/arch.h" -#include "util/intrinsics.h" #if !defined(HAVE_SSE42) @@ -579,53 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf, } #else // HAVE_SSE42 - -#ifdef ARCH_64_BIT -#define CRC_WORD 8 -#define CRC_TYPE u64a -#define CRC_FUNC _mm_crc32_u64 -#else -#define CRC_WORD 4 -#define CRC_TYPE u32 -#define CRC_FUNC _mm_crc32_u32 -#endif - -/* - * Use the crc32 instruction from SSE4.2 to compute our checksum - same - * polynomial as the above 
function. - */ -static really_inline -u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, - const size_t length) { - u32 crc = running_crc; - - // Process byte-by-byte until p_buf is aligned - - const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); - size_t init_bytes = aligned_buf - p_buf; - size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; - size_t end_bytes = length - init_bytes - running_length; - - while (p_buf < aligned_buf) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - // Main aligned loop, processes a word at a time. - - for (size_t li = 0; li < running_length/CRC_WORD; li++) { - CRC_TYPE block = *(const CRC_TYPE *)p_buf; - crc = CRC_FUNC(crc, block); - p_buf += CRC_WORD; - } - - // Remaining bytes - - for(size_t li = 0; li < end_bytes; li++) { - crc = _mm_crc32_u8(crc, *p_buf++); - } - - return crc; -} +#include "util/arch/x86/crc32.h" #endif #ifdef VERIFY_ASSERTION diff --git a/src/util/arch/x86/crc32.h b/src/util/arch/x86/crc32.h new file mode 100644 index 00000000..d5e7d424 --- /dev/null +++ b/src/util/arch/x86/crc32.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_ARCH_X86_CRC32_H_ +#define UTIL_ARCH_X86_CRC32_H_ + +#include "util/arch/x86/x86.h" +#include "util/intrinsics.h" + +#ifdef ARCH_64_BIT +#define CRC_WORD 8 +#define CRC_TYPE u64a +#define CRC_FUNC _mm_crc32_u64 +#else +#define CRC_WORD 4 +#define CRC_TYPE u32 +#define CRC_FUNC _mm_crc32_u32 +#endif + +/* + * Use the crc32 instruction from SSE4.2 to compute our checksum - same + * polynomial as the above function. 
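+ * (i.e. the table-driven crc32c_sb8_64_bit() that stays behind in crc32.c)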
+ */ +static really_inline +u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, + const size_t length) { + u32 crc = running_crc; + + // Process byte-by-byte until p_buf is aligned + + const unsigned char *aligned_buf = ROUNDUP_PTR(p_buf, CRC_WORD); + size_t init_bytes = aligned_buf - p_buf; + size_t running_length = ((length - init_bytes)/CRC_WORD)*CRC_WORD; + size_t end_bytes = length - init_bytes - running_length; + + while (p_buf < aligned_buf) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + // Main aligned loop, processes a word at a time. + + for (size_t li = 0; li < running_length/CRC_WORD; li++) { + CRC_TYPE block = *(const CRC_TYPE *)p_buf; + crc = CRC_FUNC(crc, block); + p_buf += CRC_WORD; + } + + // Remaining bytes + + for(size_t li = 0; li < end_bytes; li++) { + crc = _mm_crc32_u8(crc, *p_buf++); + } + + return crc; +} + +#endif // UTIL_ARCH_X86_CRC32_H_ \ No newline at end of file From 956b001613ef301e9e5b2e2742c9bad3037ddaef Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:51:39 +0300 Subject: [PATCH 04/53] move masked_move* AVX2 implementation to util/arch/x86 --- src/util/{ => arch/x86}/masked_move.c | 0 src/util/{ => arch/x86}/masked_move.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename src/util/{ => arch/x86}/masked_move.c (100%) rename src/util/{ => arch/x86}/masked_move.h (100%) diff --git a/src/util/masked_move.c b/src/util/arch/x86/masked_move.c similarity index 100% rename from src/util/masked_move.c rename to src/util/arch/x86/masked_move.c diff --git a/src/util/masked_move.h b/src/util/arch/x86/masked_move.h similarity index 100% rename from src/util/masked_move.h rename to src/util/arch/x86/masked_move.h From 8ed5f4ac757b7eca7baf5dc58c3552f2bdc792c2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 18 Sep 2020 12:55:57 +0300 Subject: [PATCH 05/53] fix include paths for masked_move --- CMakeLists.txt | 4 ++-- src/hwlm/noodle_engine.c | 5 ++++- src/util/arch/x86/masked_move.h | 6 +++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cd6ad96..e5078848 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -694,7 +694,6 @@ set (hs_exec_SRCS src/util/exhaust.h src/util/fatbit.h src/util/join.h - src/util/masked_move.h src/util/multibit.h src/util/multibit.c src/util/multibit_compress.h @@ -716,7 +715,8 @@ set (hs_exec_SRCS set (hs_exec_avx2_SRCS src/fdr/teddy_avx2.c - src/util/masked_move.c + src/util/arch/x86/masked_move.c + src/util/arch/x86/masked_move.h ) diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index d4f6902a..da61dfe8 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -39,10 +39,13 @@ #include "util/compare.h" #include "util/intrinsics.h" #include "util/join.h" -#include "util/masked_move.h" #include "util/partial_store.h" #include "util/simd_utils.h" +#if defined(HAVE_AVX2) +#include "util/arch/x86/masked_move.h" +#endif + #include #include #include diff --git a/src/util/arch/x86/masked_move.h b/src/util/arch/x86/masked_move.h index 4c877ca9..c46ad144 100644 --- a/src/util/arch/x86/masked_move.h +++ b/src/util/arch/x86/masked_move.h @@ -29,12 +29,12 @@ #ifndef MASKED_MOVE_H #define MASKED_MOVE_H -#include "arch.h" +#include "x86.h" #if defined(HAVE_AVX2) -#include "unaligned.h" -#include "simd_utils.h" +#include "util/unaligned.h" +#include "util/simd_utils.h" #ifdef __cplusplus extern "C" { From aac1f0f1dc2bdbdf330198e84e972871371a5ab0 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: 
Tue, 22 Sep 2020 11:02:07 +0300 Subject: [PATCH 06/53] move x86 bitutils.h implementations to util/arch/x86/bitutils.h --- src/util/arch/common/bitutils.h | 353 +++++++++++++++++++++++++++++ src/util/arch/x86/bitutils.h | 304 +++++++++++++++++++++++++ src/util/bitutils.h | 384 +++----------------------------- 3 files changed, 688 insertions(+), 353 deletions(-) create mode 100644 src/util/arch/common/bitutils.h create mode 100644 src/util/arch/x86/bitutils.h diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h new file mode 100644 index 00000000..85d5dc49 --- /dev/null +++ b/src/util/arch/common/bitutils.h @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_COMMON_H +#define BITUTILS_ARCH_COMMON_H + +#include "util/popcount.h" + +static really_inline +u32 clz32_impl_c(u32 x) { + return (u32)__builtin_clz(x); +} + +static really_inline +u32 clz64_impl_c(u64a x) { + return (u32)__builtin_clzll(x); +} + +// CTZ (count trailing zero) implementations. 
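+// The generic versions in this header use the GCC/Clang builtins and form the
+// portable fallback layer: util/arch/x86/bitutils.h includes this file and
+// wraps each *_impl_c() in an *_impl() that substitutes a faster x86 sequence
+// only where one is available.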
+static really_inline +u32 ctz32_impl_c(u32 x) { + return (u32)__builtin_ctz(x); +} + +static really_inline +u32 ctz64_impl_c(u64a x) { + return (u32)__builtin_ctzll(x); +} + +static really_inline +u32 lg2_impl_c(u32 x) { + if (!x) { + return 0; + } + return 31 - clz32_impl_c(x); +} + +static really_inline +u64a lg2_64_impl_c(u64a x) { + if (!x) { + return 0; + } + return 63 - clz64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = ctz32_impl_c(val); + *v = val & (val - 1); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearLSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64_impl_c(val); + *v = val & (val - 1); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (u32)(*v >> 32); + u32 offset; + if (v1) { + offset = findAndClearLSB_32_impl_c(&v1); + *v = (u64a)v1 | ((u64a)v2 << 32); + } else { + offset = findAndClearLSB_32_impl_c(&v2) + 32; + *v = (u64a)v2 << 32; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 findAndClearMSB_32_impl_c(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl_c(val); + *v = val & ~(1 << offset); + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl_c(u64a *v) { +#ifdef ARCH_64_BIT + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl_c(val); + *v = val & ~(1ULL << offset); +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (*v >> 32); + u32 offset; + if (v2) { + offset = findAndClearMSB_32_impl_c(&v2) + 32; + *v = ((u64a)v2 << 32) | (u64a)v1; + } else { + offset = findAndClearMSB_32_impl_c(&v1); + *v = (u64a)v1; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 compress32_impl_c(u32 x, u32 m) { + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u32 mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +} + +static really_inline +u64a compress64_impl_c(u64a x, u64a m) { + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u64a mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + mp ^= mp << 32; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +} + +static really_inline +u32 expand32_impl_c(u32 x, u32 m) { + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u32 m0, mk, mp, mv, t; + u32 array[5]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ 
(mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 4; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +} + +static really_inline +u64a expand64_impl_c(u64a x, u64a m) { + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u64a m0, mk, mp, mv, t; + u64a array[6]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mp = mp ^ (mp << 32); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 5; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. */ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl_c(bitfield); +} + +static really_inline +char bf64_set_impl_c(u64a *bitfield, u32 i) { + u64a mask = 1ULL << i; + char was_set = !!(*bitfield & mask); + *bitfield |= mask; + + return was_set; +} + +static really_inline +void bf64_unset_impl_c(u64a *bitfield, u32 i) { + *bitfield &= ~(1ULL << i); +} + +static really_inline +u32 rank_in_mask32_impl_c(u32 mask, u32 bit) { + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64_impl_c(u64a mask, u32 bit) { + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + +static really_inline +u32 pext32_impl_c(u32 x, u32 mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_32_impl_c(&mask); + if (x & (1U << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +} + +static really_inline +u64a pext64_impl_c(u64a x, u64a mask) { + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_64_impl_c(&mask); + if (x & (1ULL << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +} + +#endif // BITUTILS_ARCH_COMMON_H diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h new file mode 100644 index 00000000..da7c747e --- /dev/null +++ b/src/util/arch/x86/bitutils.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_X86_H +#define BITUTILS_ARCH_X86_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { +#if defined(_WIN32) + unsigned long r; + _BitScanReverse(&r, x); + return 31 - r; +#else + return clz32_impl_c(x); +#endif +} + +static really_inline +u32 clz64_impl(u64a x) { +#if defined(_WIN64) + unsigned long r; + _BitScanReverse64(&r, x); + return 63 - r; +#elif defined(_WIN32) + unsigned long x1 = (u32)x; + unsigned long x2 = (u32)(x >> 32); + unsigned long r; + if (x2) { + _BitScanReverse(&r, x2); + return (u32)(31 - r); + } + _BitScanReverse(&r, (u32)x1); + return (u32)(63 - r); +#else + return clz64_impl_c(x); +#endif +} + +// CTZ (count trailing zero) implementations. 
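+// As with CLZ above, MSVC does not provide __builtin_ctz/__builtin_ctzll, so
+// the _WIN32/_WIN64 paths use the _BitScanForward* intrinsics; the non-Windows
+// paths simply defer to ctz32_impl_c()/ctz64_impl_c() from the common header.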
+static really_inline +u32 ctz32_impl(u32 x) { +#if defined(_WIN32) + unsigned long r; + _BitScanForward(&r, x); + return r; +#else + return ctz32_impl_c(x); +#endif +} + +static really_inline +u32 ctz64_impl(u64a x) { +#if defined(_WIN64) + unsigned long r; + _BitScanForward64(&r, x); + return r; +#elif defined(_WIN32) + unsigned long r; + if (_BitScanForward(&r, (u32)x)) { + return (u32)r; + } + _BitScanForward(&r, x >> 32); + return (u32)(r + 32); +#else + return ctz64_impl_c(x); +#endif +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsf %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; + + assert(offset < 32); + return offset; +#else + return findAndClearLSB_32_impl_c(v); +#endif + +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsfq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64(val); + *v = val & (val - 1); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearLSB_64_impl_c(v); +#endif +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { +#if !defined(NO_ASM) + u32 val = *v, offset; + __asm__ ("bsr %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); +#endif + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { +#ifdef ARCH_64_BIT +#if !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsrq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64_impl(val); + *v = val & ~(1ULL << offset); +#endif // ARCH_X86_64 + assert(offset < 64); + return (u32)offset; +#else + return findAndClearMSB_64_impl_c(v); +#endif +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u32(x, m); +#else + return compress32_impl_c(x, m); +#endif +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u64(x, m); +#else + return compress64_impl_c(x, m); +#endif +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u32(x, m); +#else + return expand32_impl_c(x, m); +#endif +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u64(x, m); +#else + return expand64_impl_c(x, m); +#endif +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { +#if defined(HAVE_BMI2) + // Intel BMI2 can do this operation in one instruction. + return _pext_u32(x, mask); +#else + return pext32_impl_c(x, mask); +#endif +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + // Intel BMI2 can do this operation in one instruction. + return _pext_u64(x, mask); +#else + return pext64_impl_c(x, mask); +#endif +} + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#endif + +#endif // BITUTILS_ARCH_X86_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c545ee18..651e5f93 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -33,6 +33,7 @@ #ifndef BITUTILS_H #define BITUTILS_H +#include "config.h" #include "ue2common.h" #include "popcount.h" #include "util/arch.h" @@ -43,351 +44,88 @@ #define DOUBLE_CASE_CLEAR 0xdfdf #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL + +#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/bitutils.h" +#endif + static really_inline u32 clz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanReverse(&r, x); - return 31 - r; -#else - return (u32)__builtin_clz(x); -#endif + + return clz32_impl(x); } static really_inline u32 clz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanReverse64(&r, x); - return 63 - r; -#elif defined(_WIN32) - unsigned long x1 = (u32)x; - unsigned long x2 = (u32)(x >> 32); - unsigned long r; - if (x2) { - _BitScanReverse(&r, x2); - return (u32)(31 - r); - } - _BitScanReverse(&r, (u32)x1); - return (u32)(63 - r); -#else - return (u32)__builtin_clzll(x); -#endif + + return clz64_impl(x); } // CTZ (count trailing zero) implementations. 
static really_inline u32 ctz32(u32 x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN32) - unsigned long r; - _BitScanForward(&r, x); - return r; -#else - return (u32)__builtin_ctz(x); -#endif + + return ctz32_impl(x); } static really_inline u32 ctz64(u64a x) { assert(x); // behaviour not defined for x == 0 -#if defined(_WIN64) - unsigned long r; - _BitScanForward64(&r, x); - return r; -#elif defined(_WIN32) - unsigned long r; - if (_BitScanForward(&r, (u32)x)) { - return (u32)r; - } - _BitScanForward(&r, x >> 32); - return (u32)(r + 32); -#else - return (u32)__builtin_ctzll(x); -#endif + + return ctz64_impl(x); } static really_inline u32 lg2(u32 x) { - if (!x) { - return 0; - } - return 31 - clz32(x); + return lg2_impl(x); } static really_inline u64a lg2_64(u64a x) { - if (!x) { - return 0; - } - return 63 - clz64(x); + return lg2_64_impl(x); } static really_inline u32 findAndClearLSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsf %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = ctz32(val); - *v = val & (val - 1); -#endif - - assert(offset < 32); - return offset; + return findAndClearLSB_32_impl(v); } static really_inline u32 findAndClearLSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsfq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = ctz64(val); - *v = val & (val - 1); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (u32)(*v >> 32); - u32 offset; - if (v1) { - offset = findAndClearLSB_32(&v1); - *v = (u64a)v1 | ((u64a)v2 << 32); - } else { - offset = findAndClearLSB_32(&v2) + 32; - *v = (u64a)v2 << 32; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearLSB_64_impl(v); } static really_inline u32 findAndClearMSB_32(u32 *v) { - assert(*v != 0); // behaviour not defined in this case -#ifndef NO_ASM - u32 val = *v, offset; - __asm__ ("bsr %1, %0\n" - "btr %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - u32 val = *v; - u32 offset = 31 - clz32(val); - *v = val & ~(1 << offset); -#endif - assert(offset < 32); - return offset; + return findAndClearMSB_32_impl(v); } static really_inline u32 findAndClearMSB_64(u64a *v) { - assert(*v != 0); // behaviour not defined in this case - -#ifdef ARCH_64_BIT -#if defined(ARCH_X86_64) && !defined(NO_ASM) - u64a val = *v, offset; - __asm__ ("bsrq %1, %0\n" - "btrq %0, %1\n" - : "=r" (offset), "=r" (val) - : "1" (val)); - *v = val; -#else - // generic variant using gcc's builtin on 64-bit - u64a val = *v, offset; - offset = 63 - clz64(val); - *v = val & ~(1ULL << offset); -#endif // ARCH_X86_64 -#else - // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't - // inline calls to __builtin_ctzll - u32 v1 = (u32)*v; - u32 v2 = (*v >> 32); - u32 offset; - if (v2) { - offset = findAndClearMSB_32(&v2) + 32; - *v = ((u64a)v2 << 32) | (u64a)v1; - } else { - offset = findAndClearMSB_32(&v1); - *v = (u64a)v1; - } -#endif - - assert(offset < 64); - return (u32)offset; + return findAndClearMSB_64_impl(v); } static really_inline u32 
compress32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u32(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u32 mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } - - return x; -#endif + return compress32_impl(x, m); } static really_inline u64a compress64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pext_u64(x, m); -#else - - // Return zero quickly on trivial cases - if ((x & m) == 0) { - return 0; - } - - u64a mk, mp, mv, t; - - x &= m; // clear irrelevant bits - - mk = ~m << 1; // we will count 0's to right - for (u32 i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); - mp ^= mp << 2; - mp ^= mp << 4; - mp ^= mp << 8; - mp ^= mp << 16; - mp ^= mp << 32; - - mv = mp & m; // bits to move - m = (m ^ mv) | (mv >> (1 << i)); // compress m - t = x & mv; - x = (x ^ t) | (t >> (1 << i)); // compress x - mk = mk & ~mp; - } - - return x; -#endif + return compress64_impl(x, m); } static really_inline u32 expand32(u32 x, u32 m) { -#if defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u32(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u32 m0, mk, mp, mv, t; - u32 array[5]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 5; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 4; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand32_impl(x, m); } static really_inline u64a expand64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(HAVE_BMI2) - // BMI2 has a single instruction for this operation. - return _pdep_u64(x, m); -#else - - // Return zero quickly on trivial cases - if (!x || !m) { - return 0; - } - - u64a m0, mk, mp, mv, t; - u64a array[6]; - - m0 = m; // save original mask - mk = ~m << 1; // we will count 0's to right - - for (int i = 0; i < 6; i++) { - mp = mk ^ (mk << 1); // parallel suffix - mp = mp ^ (mp << 2); - mp = mp ^ (mp << 4); - mp = mp ^ (mp << 8); - mp = mp ^ (mp << 16); - mp = mp ^ (mp << 32); - mv = mp & m; // bits to move - array[i] = mv; - m = (m ^ mv) | (mv >> (1 << i)); // compress m - mk = mk & ~mp; - } - - for (int i = 5; i >= 0; i--) { - mv = array[i]; - t = x << (1 << i); - x = (x & ~mv) | (t & mv); - } - - return x & m0; // clear out extraneous bits -#endif + return expand64_impl(x, m); } @@ -396,97 +134,37 @@ u64a expand64(u64a x, u64a m) { */ static really_inline u32 bf64_iterate(u64a bitfield, u32 begin) { - if (begin != ~0U) { - /* switch off all bits at or below begin. Note: not legal to shift by - * by size of the datatype or larger. 
*/ - assert(begin <= 63); - bitfield &= ~((2ULL << begin) - 1); - } - - if (!bitfield) { - return ~0U; - } - - return ctz64(bitfield); + return bf64_iterate_impl(bitfield, begin); } static really_inline char bf64_set(u64a *bitfield, u32 i) { - assert(i < 64); - u64a mask = 1ULL << i; - char was_set = !!(*bitfield & mask); - *bitfield |= mask; - - return was_set; + return bf64_set_impl(bitfield, i); } static really_inline void bf64_unset(u64a *bitfield, u32 i) { - assert(i < 64); - *bitfield &= ~(1ULL << i); + return bf64_unset_impl(bitfield, i); } static really_inline u32 rank_in_mask32(u32 mask, u32 bit) { - assert(bit < sizeof(u32) * 8); - assert(mask & (u32)(1U << bit)); - mask &= (u32)(1U << bit) - 1; - return popcount32(mask); + return rank_in_mask32_impl(mask, bit); } static really_inline u32 rank_in_mask64(u64a mask, u32 bit) { - assert(bit < sizeof(u64a) * 8); - assert(mask & (u64a)(1ULL << bit)); - mask &= (u64a)(1ULL << bit) - 1; - return popcount64(mask); + return rank_in_mask64_impl(mask, bit); } static really_inline u32 pext32(u32 x, u32 mask) { -#if defined(HAVE_BMI2) - // Intel BMI2 can do this operation in one instruction. - return _pext_u32(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_32(&mask); - if (x & (1U << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext32_impl(x, mask); } static really_inline u64a pext64(u64a x, u64a mask) { -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) - // Intel BMI2 can do this operation in one instruction. - return _pext_u64(x, mask); -#else - - u32 result = 0, num = 1; - while (mask != 0) { - u32 bit = findAndClearLSB_64(&mask); - if (x & (1ULL << bit)) { - assert(num != 0); // more than 32 bits! - result |= num; - } - num <<= 1; - } - return result; -#endif + return pext64_impl(x, mask); } -#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) -static really_inline -u64a pdep64(u64a x, u64a mask) { - return _pdep_u64(x, mask); -} -#endif - #endif // BITUTILS_H From 6581aae90e55520353c03edb716de80ecc03521a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 11:45:24 +0300 Subject: [PATCH 07/53] move x86 popcount.h implementations to util/arch/x86/popcount.h --- src/util/arch/common/popcount.h | 60 +++++++++++++++++++++++++++++ src/util/arch/x86/popcount.h | 67 +++++++++++++++++++++++++++++++++ src/util/popcount.h | 35 ++++------------- 3 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 src/util/arch/common/popcount.h create mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h new file mode 100644 index 00000000..0bd1e837 --- /dev/null +++ b/src/util/arch/common/popcount.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_COMMON_H +#define POPCOUNT_ARCH_COMMON_H + +static really_inline +u32 popcount32_impl_c(u32 x) { + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +} + +static really_inline +u32 popcount64_impl_c(u64a x) { +#if defined(ARCH_64_BIT) + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h new file mode 100644 index 00000000..86929ede --- /dev/null +++ b/src/util/arch/x86/popcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_X86_H +#define POPCOUNT_ARCH_X86_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/popcount.h" + +static really_inline +u32 popcount32_impl(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + return popcount32_impl_c(x); +#endif +} + +static really_inline +u32 popcount64_impl(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + return popcount64_impl_c(x); +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b1..932fc2cf 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,41 +33,22 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ +#include "config.h" #include "ue2common.h" #include "util/arch.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/popcount.h" +#endif + static really_inline u32 popcount32(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -#endif + return popcount32_impl(x); } static really_inline -u32 popcount64(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32(x >> 32) + popcount32(x); -#endif +u32 popcount64(u32 x) { + return popcount64_impl(x); } #endif /* UTIL_POPCOUNT_H_ */ From 9f3ad89ed63dc56f8fe84b88a5ed81a7c5c6b11b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 12:17:27 +0300 Subject: [PATCH 08/53] move andn helper function to bitutils.h --- src/fdr/fdr.c | 15 +-------------- src/util/arch/common/bitutils.h | 9 +++++++++ src/util/arch/x86/bitutils.h | 14 ++++++++++++++ src/util/bitutils.h | 8 ++++++++ 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d33756d3..b0f90b52 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -36,6 +36,7 @@ #include "teddy.h" #include "teddy_internal.h" #include "util/arch.h" +#include "util/bitutils.h" #include "util/simd_utils.h" #include "util/uniform_ops.h" @@ -119,20 +120,6 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; -/* compilers don't reliably synthesize the 32-bit ANDN instruction here, - * so we force its generation. - */ -static really_inline -u64a andn(const u32 a, const u8 *b) { - u64a r; -#if defined(HAVE_BMI) && !defined(NO_ASM) - __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); -#else - r = unaligned_load_u32(b) & ~a; -#endif - return r; -} - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index 85d5dc49..f2706d70 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -34,6 +34,7 @@ #define BITUTILS_ARCH_COMMON_H #include "util/popcount.h" +#include "util/unaligned.h" static really_inline u32 clz32_impl_c(u32 x) { @@ -350,4 +351,12 @@ u64a pext64_impl_c(u64a x, u64a mask) { return result; } +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl_c(const u32 a, const u8 *b) { + return unaligned_load_u32(b) & ~a; +} + #endif // BITUTILS_ARCH_COMMON_H diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index da7c747e..ec4c95ad 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -301,4 +301,18 @@ u64a pdep64(u64a x, u64a mask) { } #endif +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { +#if defined(HAVE_BMI) && !defined(NO_ASM) + u64a r; + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); + return r; +#else + return andn_impl_c(a, b); +#endif +} + #endif // BITUTILS_ARCH_X86_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 651e5f93..b9f312cb 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -167,4 +167,12 @@ u64a pext64(u64a x, u64a mask) { return pext64_impl(x, mask); } +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. 
+ */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + #endif // BITUTILS_H From e915d848640baba904ada9a576eed00361d2e06b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 13:10:52 +0300 Subject: [PATCH 09/53] no need to check for WIN32* --- src/util/bitutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/bitutils.h b/src/util/bitutils.h index b9f312cb..7373a9c8 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -45,7 +45,7 @@ #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL -#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/bitutils.h" #endif From e8e188acaf450a86ff6e7c3f611815bb67710732 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 22 Sep 2020 13:12:07 +0300 Subject: [PATCH 10/53] move x86 implementations of simd_utils.h to util/arch/x86/ --- src/util/arch/x86/simd_utils.h | 1312 ++++++++++++++++++++++++++++++++ src/util/simd_utils.h | 1281 +------------------------------ 2 files changed, 1317 insertions(+), 1276 deletions(-) create mode 100644 src/util/arch/x86/simd_utils.h diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h new file mode 100644 index 00000000..6ec4042b --- /dev/null +++ b/src/util/arch/x86/simd_utils.h @@ -0,0 +1,1312 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. 
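+ * (x86 SSE/AVX/AVX-512 implementations of the m128/m256/m512 operations)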
+ */ + +#ifndef ARCH_X86_SIMD_UTILS_H +#define ARCH_X86_SIMD_UTILS_H + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. + * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. 
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} +#endif + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if 
defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? + u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
+static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif // ARCH_X86_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 671a5bab..019dc125 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -30,21 +30,11 @@ * \brief SIMD types and primitive operations. */ -#ifndef SIMD_UTILS -#define SIMD_UTILS - -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif +#ifndef SIMD_UTILS_H +#define SIMD_UTILS_H #include "config.h" #include "util/arch.h" -#include "ue2common.h" -#include "simd_types.h" -#include "unaligned.h" -#include "util/intrinsics.h" - -#include // for memcpy // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. @@ -71,1269 +61,8 @@ extern const char vbs_mask_data[]; } #endif -static really_inline m128 ones128(void) { -#if defined(__GNUC__) || defined(__INTEL_COMPILER) - /* gcc gets this right */ - return _mm_set1_epi8(0xFF); -#else - /* trick from Intel's optimization guide to generate all-ones. - * ICC converts this to the single cmpeq instruction */ - return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); -#endif -} - -static really_inline m128 zeroes128(void) { - return _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. 
- */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(HAVE_SSE41) - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -#else - u32 d = diffrich128(a, b); - return (d | (d >> 1)) & 0x5; -#endif -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm_sll_epi64(a, x); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -static really_inline m128 set16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. - return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/simd_utils.h" #endif -static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) - return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) -#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) - -#if defined(HAVE_SSE41) -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) -#else -#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) -#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) -#endif - -#if !defined(HAVE_AVX2) -// TODO: this entire file needs restructuring - this carveout is awful -#define extractlow64from256(a) movq(a.lo) -#define extractlow32from256(a) movd(a.lo) -#if defined(HAVE_SSE41) -#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) -#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) -#else -#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) -#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? 
a.hi : a.lo, (imm % 2) * 8)) -#endif - -#endif // !AVX2 - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -#ifdef __cplusplus -extern "C" { -#endif -extern const u8 simd_onebit_masks[]; -#ifdef __cplusplus -} -#endif - -static really_inline -m128 mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; -} - -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set64x2(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -/**** - **** 256-bit Primitives - ****/ - -#if defined(HAVE_AVX2) - -static really_really_inline -m256 lshift64_m256(m256 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm256_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm256_sll_epi64(a, x); -} - -#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) - -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); -} - -#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) -#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) - -static really_inline -m256 set2x128(m128 a) { - return _mm256_broadcastsi128_si256(a); -} - -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - -static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) - return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif -} - -static really_inline m256 ones256(void) { -#if 
defined(HAVE_AVX2) - m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif - return rv; -} - -#if defined(HAVE_AVX2) -static really_inline m256 and256(m256 a, m256 b) { - return _mm256_and_si256(a, b); -} -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 or256(m256 a, m256 b) { - return _mm256_or_si256(a, b); -} -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 xor256(m256 a, m256 b) { - return _mm256_xor_si256(a, b); -} -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 not256(m256 a) { - return _mm256_xor_si256(a, ones256()); -} -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif - -#if defined(HAVE_AVX2) -static really_inline m256 andnot256(m256 a, m256 b) { - return _mm256_andnot_si256(a, b); -} -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif - -static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif -} - -static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) - return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif -} - -/** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - a = _mm256_cmpeq_epi32(a, b); - return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif -} - -/** - * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and - * returns an 8-bit mask indicating which 64-bit words contain differences. 
- */ -static really_inline u32 diffrich64_256(m256 a, m256 b) { - u32 d = diffrich256(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m256 load256(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif -} - -// aligned load of 128-bit value to low and high part of 256-bit value -static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) - return set2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif -} - -static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); -} - -// aligned store -static really_inline void store256(void *ptr, m256 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) - _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif -} - -// unaligned load -static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) - return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif -} - -// unaligned store -static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) - _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes256(void *ptr, m256 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m256 loadbytes256(const void *ptr, unsigned int n) { - m256 a = zeroes256(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m256 mask1bit256(unsigned int n) { - assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu256(&simd_onebit_masks[mask_idx]); -} - -static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) - return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); - return rv; -#endif -} - -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - -// switches on bit N in the given vector. 
-static really_inline -void setbit256(m256 *ptr, unsigned int n) { - *ptr = or256(mask1bit256(n), *ptr); -} - -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - *ptr = andnot256(mask1bit256(n), *ptr); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - const m256 mask = mask1bit256(n); - return !_mm256_testz_si256(mask, val); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return _mm256_extracti128_si256(x, 1); -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return _mm256_extracti128_si256(x, 0); -} - -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { -#if defined(_mm256_set_m128i) - return _mm256_set_m128i(hi, lo); -#else - return insert128to256(cast128to256(lo), hi, 1); -#endif -} -#endif //AVX2 - -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.mid = _mm_cmpeq_epi32(a.mid, b.mid); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), - _mm_packs_epi32(a.hi, z)); - return ~(_mm_movemask_epi8(packed)) & 0xfff; -} - -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} - -/**** - **** 512-bit Primitives - ****/ - -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) - -static really_inline -m512 zeroes512(void) { -#if defined(HAVE_AVX512) - return _mm512_setzero_si512(); -#else - m512 rv = {zeroes256(), zeroes256()}; - return rv; -#endif -} - -static really_inline -m512 ones512(void) { -#if defined(HAVE_AVX512) - return _mm512_set1_epi8(0xFF); - //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 set64x8(u8 a) { - return _mm512_set1_epi8(a); -} - -static really_inline -m512 set8x64(u64a a) { - return _mm512_set1_epi64(a); -} - -static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, - u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { - return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, - lo_3, lo_2, lo_1, lo_0); -} - -static really_inline -m512 swap256in512(m512 a) { - m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); - return vpermq512(idx, a); -} - -static really_inline -m512 set4x128(m128 a) { - return _mm512_broadcast_i32x4(a); -} -#endif - -static really_inline -m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif -} - -static really_inline -m512 not512(m512 a) { -#if defined(HAVE_AVX512) - return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif -} - -static really_inline -m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(b)) { - return _mm512_slli_epi64(a, b); - } -#endif - m128 x = _mm_cvtsi32_si128(b); - return _mm512_sll_epi64(a, x); -} -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif - -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - -#if !defined(_MM_CMPINT_NE) -#define _MM_CMPINT_NE 0x4 -#endif - -static really_inline -int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, 
b.hi); -#endif -} - -static really_inline -int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) - return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif -} - -/** - * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline -u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) - return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif -} - -/** - * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and - * returns a 16-bit mask indicating which 64-bit words contain differences. - */ -static really_inline -u32 diffrich64_512(m512 a, m512 b) { - //TODO: cmp_epi64? - u32 d = diffrich512(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline -m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif -} - -// aligned store -static really_inline -void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) - return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif -} - -// unaligned load -static really_inline -m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) - return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { - return _mm512_maskz_loadu_epi8(k, ptr); -} - -static really_inline -m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { - return _mm512_mask_loadu_epi8(src, k, ptr); -} - -static really_inline -m512 set_mask_m512(__mmask64 k) { - return _mm512_movm_epi8(k); -} -#endif - -// packed unaligned store of first N bytes -static really_inline -void storebytes512(void *ptr, m512 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m512 loadbytes512(const void *ptr, unsigned int n) { - m512 a = zeroes512(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m512 mask1bit512(unsigned int n) { - assert(n < sizeof(m512) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu512(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. 
-static really_inline -void setbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif -} - -// switches off bit N in the given vector. -static really_inline -void clearbit512(m512 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif -} - -// tests bit N in the given vector. -static really_inline -char testbit512(m512 val, unsigned int n) { - assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) - const m512 mask = mask1bit512(n); - return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif -} - -#endif +#endif // SIMD_UTILS_H From f7a6b8934cddbdfd77f1eb565b7ba08f9aa6a5f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 11:49:26 +0300 Subject: [PATCH 11/53] add some set*() functions, harmonize names, rename setAxB to set1_AxB when using mm_set1_* internally --- src/util/arch/x86/simd_utils.h | 73 +++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 6ec4042b..2d099f56 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -111,14 +111,18 @@ m128 lshift64_m128(m128 a, unsigned b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) -static really_inline m128 set16x8(u8 c) { +static really_inline m128 set1_16x8(u8 c) { return _mm_set1_epi8(c); } -static really_inline m128 set4x32(u32 c) { +static really_inline m128 set1_4x32(u32 c) { return _mm_set1_epi32(c); } +static really_inline m128 set1_2x64(u64a c) { + return _mm_set1_epi64x(c); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -335,7 +339,12 @@ m128 sub_u8_m128(m128 a, m128 b) { } static really_inline -m128 set64x2(u64a hi, u64a lo) { +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + return _mm_set_epi32(x3, x2, x1, x0); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { return _mm_set_epi64x(hi, lo); } @@ -358,16 +367,15 @@ m256 lshift64_m256(m256 a, unsigned b) { #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) -static really_inline -m256 set32x8(u32 in) { - return _mm256_set1_epi8(in); +static really_inline m256 set1_4x64(u64a c) { + return _mm256_set1_epi64x(c); } #define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) #define movemask256(a) ((u32)_mm256_movemask_epi8((a))) 
static really_inline -m256 set2x128(m128 a) { +m256 set1_2x128(m128 a) { return _mm256_broadcastsi128_si256(a); } @@ -388,13 +396,6 @@ m256 rshift64_m256(m256 a, int b) { rv.hi = rshift64_m128(rv.hi, b); return rv; } -static really_inline -m256 set32x8(u32 in) { - m256 rv; - rv.lo = set16x8((u8) in); - rv.hi = rv.lo; - return rv; -} static really_inline m256 eq256(m256 a, m256 b) { @@ -412,7 +413,7 @@ u32 movemask256(m256 a) { } static really_inline -m256 set2x128(m128 a) { +m256 set1_2x128(m128 a) { m256 rv = {a, a}; return rv; } @@ -557,7 +558,7 @@ static really_inline m256 load256(const void *ptr) { // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { #if defined(HAVE_AVX2) - return set2x128(load128(ptr)); + return set1_2x128(load128(ptr)); #else assert(ISALIGNED_N(ptr, alignof(m128))); m256 rv; @@ -567,7 +568,7 @@ static really_inline m256 load2x128(const void *ptr) { } static really_inline m256 loadu2x128(const void *ptr) { - return set2x128(loadu128(ptr)); + return set1_2x128(loadu128(ptr)); } // aligned store @@ -626,13 +627,37 @@ m256 mask1bit256(unsigned int n) { } static really_inline -m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +m256 set1_32x8(u32 in) { +#if defined(HAVE_AVX2) + return _mm256_set1_epi8(in); +#else + m256 rv; + rv.hi = set1_16x8(in); + rv.lo = set1_16x8(in); + return rv; +#endif +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); +#else + m256 rv; + rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); + rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); + return rv; +#endif +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { #if defined(HAVE_AVX2) return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); #else m256 rv; - rv.hi = set64x2(hi_1, hi_0); - rv.lo = set64x2(lo_1, lo_0); + rv.hi = set2x64(hi_1, hi_0); + rv.lo = set2x64(lo_1, lo_0); return rv; #endif } @@ -964,17 +989,17 @@ m512 ones512(void) { #if defined(HAVE_AVX512) static really_inline -m512 set64x8(u8 a) { +m512 set1_64x8(u8 a) { return _mm512_set1_epi8(a); } static really_inline -m512 set8x64(u64a a) { +m512 set1_8x64(u64a a) { return _mm512_set1_epi64(a); } static really_inline -m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); @@ -987,7 +1012,7 @@ m512 swap256in512(m512 a) { } static really_inline -m512 set4x128(m128 a) { +m512 set1_4x128(m128 a) { return _mm512_broadcast_i32x4(a); } #endif From 53334672495387c4575ca88834d5f5ee2ae726f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 11:51:21 +0300 Subject: [PATCH 12/53] fix names, use own intrinsic instead of explicit _mm* ones --- src/fdr/teddy.c | 64 +++++++++++++++--------------- src/fdr/teddy_avx2.c | 12 +++--- src/hwlm/noodle_engine_avx2.c | 4 +- src/hwlm/noodle_engine_sse.c | 4 +- src/nfa/mcclellan_common_impl.h | 2 +- src/nfa/mcsheng.c | 8 ++-- src/nfa/sheng_impl.h | 2 +- src/nfa/sheng_impl4.h | 2 +- src/nfa/shufti.c | 30 +++++++------- src/nfa/truffle.c | 16 ++++---- src/nfa/vermicelli_sse.h | 20 +++++----- src/rose/counting_miracle.h | 4 +- src/rose/program_runtime.c | 20 +++++----- src/rose/validate_shufti.h | 16 ++++---- src/util/state_compress.c | 70 
++++++++++++++++----------------- 15 files changed, 137 insertions(+), 137 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 960e2a41..97cff0b4 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -311,26 +311,26 @@ const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ m512 sl_msk[n - 1]; \ PREPARE_MASKS_##n \ @@ -570,26 +570,26 @@ m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, &c_0, &c_16, &c_32, &c_48) #define PREPARE_MASKS_1 \ - dup_mask[0] = set4x128(maskBase[0]); \ - dup_mask[1] = set4x128(maskBase[1]); + dup_mask[0] = set1_4x128(maskBase[0]); \ + dup_mask[1] = set1_4x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set4x128(maskBase[2]); \ - dup_mask[3] = set4x128(maskBase[3]); + dup_mask[2] = set1_4x128(maskBase[2]); \ + dup_mask[3] = set1_4x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set4x128(maskBase[4]); \ - dup_mask[5] = set4x128(maskBase[5]); + dup_mask[4] = set1_4x128(maskBase[4]); \ + dup_mask[5] = set1_4x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set4x128(maskBase[6]); \ - dup_mask[7] = set4x128(maskBase[7]); + dup_mask[6] = set1_4x128(maskBase[6]); \ + dup_mask[7] = set1_4x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m512 lo_mask = set64x8(0xf); \ + m512 lo_mask = set1_64x8(0xf); \ m512 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -713,7 +713,7 @@ do { \ #define PREP_SHUF_MASK \ PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ *c_128 = *(ptr + 15); \ - m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ *c_0 = *(ptr + 31) #define SHIFT_OR_M1 \ @@ -805,26 +805,26 @@ m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) #define PREPARE_MASKS_1 \ - dup_mask[0] = set2x128(maskBase[0]); \ - dup_mask[1] = set2x128(maskBase[1]); + dup_mask[0] = set1_2x128(maskBase[0]); \ + dup_mask[1] = set1_2x128(maskBase[1]); #define PREPARE_MASKS_2 \ PREPARE_MASKS_1 \ - dup_mask[2] = set2x128(maskBase[2]); \ - dup_mask[3] = set2x128(maskBase[3]); + dup_mask[2] = set1_2x128(maskBase[2]); \ + dup_mask[3] = set1_2x128(maskBase[3]); #define PREPARE_MASKS_3 \ PREPARE_MASKS_2 \ - dup_mask[4] = set2x128(maskBase[4]); \ - dup_mask[5] = set2x128(maskBase[5]); + dup_mask[4] = set1_2x128(maskBase[4]); \ + dup_mask[5] = set1_2x128(maskBase[5]); #define PREPARE_MASKS_4 \ PREPARE_MASKS_3 \ - dup_mask[6] = set2x128(maskBase[6]); \ - 
dup_mask[7] = set2x128(maskBase[7]); + dup_mask[6] = set1_2x128(maskBase[6]); \ + dup_mask[7] = set1_2x128(maskBase[7]); #define PREPARE_MASKS(n) \ - m256 lo_mask = set32x8(0xf); \ + m256 lo_mask = set1_32x8(0xf); \ m256 dup_mask[n * 2]; \ PREPARE_MASKS_##n @@ -925,7 +925,7 @@ do { \ static really_inline m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); return or128(pshufb_m128(maskBase[0 * 2], lo), @@ -934,7 +934,7 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { static really_inline m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); @@ -949,7 +949,7 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { static really_inline m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); @@ -964,7 +964,7 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, static really_inline m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 *old_3, m128 val) { - m128 mask = set16x8(0xf); + m128 mask = set1_16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index 20ea938c..df54fc62 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -501,15 +501,15 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, const u8 *buf_history, size_t len_history, const u32 nMasks) { m128 p_mask128; - m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, buf_history, len_history, nMasks)); - *p_mask = set2x128(p_mask128); + *p_mask = set1_2x128(p_mask128); return ret; } static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); return or256(pshufb_m256(maskBase[0 * 2], lo), @@ -518,7 +518,7 @@ m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { static really_inline m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); @@ -533,7 +533,7 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { static really_inline m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); @@ -548,7 +548,7 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, static really_inline m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 *old_3, m256 val) { - m256 mask = set32x8(0xf); + m256 mask = 
set1_32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index 5edc646a..49fe168f 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -30,11 +30,11 @@ static really_inline m256 getMask(u8 c, bool noCase) { u8 k = caseClear8(c, noCase); - return set32x8(k); + return set1_32x8(k); } static really_inline m256 getCaseMask(void) { - return set32x8(0xdf); + return set1_32x8(0xdf); } static really_inline diff --git a/src/hwlm/noodle_engine_sse.c b/src/hwlm/noodle_engine_sse.c index 7cd53d7c..5d47768d 100644 --- a/src/hwlm/noodle_engine_sse.c +++ b/src/hwlm/noodle_engine_sse.c @@ -30,11 +30,11 @@ static really_inline m128 getMask(u8 c, bool noCase) { u8 k = caseClear8(c, noCase); - return set16x8(k); + return set1_16x8(k); } static really_inline m128 getCaseMask(void) { - return set16x8(0xdf); + return set1_16x8(0xdf); } static really_inline diff --git a/src/nfa/mcclellan_common_impl.h b/src/nfa/mcclellan_common_impl.h index 7b0e7f48..6ec1b1f1 100644 --- a/src/nfa/mcclellan_common_impl.h +++ b/src/nfa/mcclellan_common_impl.h @@ -59,7 +59,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 4619ff6f..dd00617e 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -72,7 +72,7 @@ u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, if (len) { m128 ss_char = load128(sherman_state); - m128 cur_char = set16x8(cprime); + m128 cur_char = set1_16x8(cprime); u32 z = movemask128(eq128(ss_char, cur_char)); @@ -153,7 +153,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, assert(s_in); /* should not already be dead */ assert(soft_c_end <= hard_c_end); DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); - m128 s = set16x8(s_in - 1); + m128 s = set1_16x8(s_in - 1); const u8 *c = *c_inout; const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; if (!do_accel) { @@ -171,8 +171,8 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, #if defined(HAVE_BMI2) && defined(ARCH_64_BIT) u32 sheng_limit_x4 = sheng_limit * 0x01010101; - m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); - m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + m128 simd_stop_limit = set1_4x32(sheng_stop_limit_x4); + m128 accel_delta = set1_16x8(sheng_limit - sheng_stop_limit); DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, m->sheng_accel_limit, sheng_stop_limit); #endif diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 9552fe15..aa416194 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -52,7 +52,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); - m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(cur_buf != end)) { diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index 74032201..c51bcdea 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -86,7 +86,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } - 
m128 cur_state = set16x8(*state); + m128 cur_state = set1_16x8(*state); const m128 *masks = s->shuffle_masks; while (likely(end - cur_buf >= 4)) { diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index 09ffc0cf..e76dcca8 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, } const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; assert(buf_end - buf >= 16); @@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const u8 *buf, const u8 *buf_end) { const m128 ones = ones128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); const u8 *rv; size_t min = (size_t)buf % 16; @@ -455,15 +455,15 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, buf, buf_end); } - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); if (buf_end - buf <= 32) { return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); } const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); + const m256 wide_mask_lo = set1_2x128(mask_lo); + const m256 wide_mask_hi = set1_2x128(mask_hi); const u8 *rv; size_t min = (size_t)buf % 32; @@ -579,15 +579,15 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, buf, buf_end); } - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); if (buf_end - buf <= 32) { return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); } const m256 zeroes = zeroes256(); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); + const m256 wide_mask_lo = set1_2x128(mask_lo); + const m256 wide_mask_hi = set1_2x128(mask_hi); const u8 *rv; assert(buf_end - buf >= 32); @@ -676,7 +676,7 @@ static really_inline const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); - const m256 low4bits = set32x8(0xf); + const m256 low4bits = set1_32x8(0xf); // run shufti over two overlapping 16-byte unaligned reads const m256 mask1 = combine2x128(mask1_hi, mask1_lo); const m256 mask2 = combine2x128(mask2_hi, mask2_lo); @@ -708,11 +708,11 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, } const m256 ones = ones256(); - const m256 low4bits = set32x8(0xf); - const m256 wide_mask1_lo = set2x128(mask1_lo); - const m256 wide_mask1_hi = set2x128(mask1_hi); - const m256 wide_mask2_lo = set2x128(mask2_lo); - const m256 wide_mask2_hi = set2x128(mask2_hi); + const m256 low4bits = set1_32x8(0xf); + const m256 wide_mask1_lo = set1_2x128(mask1_lo); + const m256 wide_mask1_hi = set1_2x128(mask1_hi); + const m256 wide_mask2_lo = set1_2x128(mask2_lo); + const m256 wide_mask2_hi = set1_2x128(mask2_hi); const u8 *rv; size_t min = (size_t)buf % 32; diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c index be6b312c..37af13ad 100644 --- a/src/nfa/truffle.c +++ b/src/nfa/truffle.c @@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - m128 
highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + m128 highconst = set1_16x8(0x80); + m128 shuf_mask_hi = set1_2x64(0x8040201008040201); // and now do the real work m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); @@ -260,8 +260,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { static really_inline u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { - m256 highconst = _mm256_set1_epi8(0x80); - m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); + m256 highconst = set1_32x8(0x80); + m256 shuf_mask_hi = set1_4x64(0x8040201008040201); // and now do the real work m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); @@ -315,8 +315,8 @@ const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { DEBUG_PRINTF("len %zu\n", buf_end - buf); - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); + const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear); + const m256 wide_set = set1_2x128(shuf_mask_lo_highset); assert(buf && buf_end); assert(buf < buf_end); @@ -382,8 +382,8 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, const u8 *buf, const u8 *buf_end) { - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); + const m256 wide_clear = set1_2x128(shuf_mask_lo_highclear); + const m256 wide_set = set1_2x128(shuf_mask_lo_highset); assert(buf && buf_end); assert(buf < buf_end); const u8 *rv; diff --git a/src/nfa/vermicelli_sse.h b/src/nfa/vermicelli_sse.h index 3307486c..dc56a5f1 100644 --- a/src/nfa/vermicelli_sse.h +++ b/src/nfa/vermicelli_sse.h @@ -36,7 +36,7 @@ #define VERM_BOUNDARY 16 #define VERM_TYPE m128 -#define VERM_SET_FN set16x8 +#define VERM_SET_FN set1_16x8 static really_inline const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, @@ -74,7 +74,7 @@ static really_inline const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 31 < buf_end; buf += 32) { m128 data = load128(buf); @@ -122,7 +122,7 @@ const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { // returns NULL if not found static really_inline const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned u32 z = movemask128(eq128(chars, and128(casemask, data))); if (negate) { @@ -157,7 +157,7 @@ static really_inline const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *buf, const u8 *buf_end) { assert((size_t)buf % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 16 < buf_end; buf += 16) { m128 data = load128(buf); @@ -219,7 +219,7 @@ const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { static really_inline const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars1, v), @@ -277,7 +277,7 @@ static 
really_inline const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, const u8 *buf_end, char negate) { assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 15 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); @@ -309,7 +309,7 @@ const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { // returns NULL if not found static really_inline const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); // unaligned u32 z = movemask128(eq128(chars, and128(casemask, data))); if (negate) { @@ -344,7 +344,7 @@ static really_inline const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, const u8 *buf, const u8 *buf_end) { assert((size_t)buf_end % 16 == 0); - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); for (; buf + 16 < buf_end; buf_end -= 16) { m128 data = load128(buf_end - 16); @@ -381,7 +381,7 @@ const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { static really_inline const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { /* due to laziness, nonalphas and nocase having interesting behaviour */ - m128 casemask = set16x8(CASE_CLEAR); + m128 casemask = set1_16x8(CASE_CLEAR); m128 data = loadu128(buf); m128 v = and128(casemask, data); u32 z = movemask128(and128(eq128(chars2, v), @@ -398,7 +398,7 @@ const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { #define VERM_BOUNDARY 64 #define VERM_TYPE m512 -#define VERM_SET_FN set64x8 +#define VERM_SET_FN set1_64x8 static really_inline const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 976208b7..6210fca5 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -47,7 +47,7 @@ char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, u32 count = *count_inout; - m128 chars = set16x8(c); + m128 chars = set1_16x8(c); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); @@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, u32 count = *count_inout; const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); + const m128 low4bits = set1_16x8(0xf); for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); diff --git a/src/rose/program_runtime.c b/src/rose/program_runtime.c index 0f2d1083..d01e30e8 100644 --- a/src/rose/program_runtime.c +++ b/src/rose/program_runtime.c @@ -938,7 +938,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, return 1; } - m256 data_m256 = set2x128(data); + m256 data_m256 = set1_2x128(data); m256 hi_mask_m256 = loadu256(hi_mask); m256 lo_mask_m256 = loadu256(lo_mask); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); @@ -974,8 +974,8 @@ int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, m128 hi_mask_m128 = loadu128(hi_mask); m128 lo_mask_m128 = loadu128(lo_mask); - m256 hi_mask_m256 = set2x128(hi_mask_m128); - m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 hi_mask_m256 = set1_2x128(hi_mask_m128); + m256 lo_mask_m256 = set1_2x128(lo_mask_m128); m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, bucket_select_mask_m256, @@ -1287,7 +1287,7 @@ int 
roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x2(valid_hi, valid_lo); + expand_valid = set2x64(valid_hi, valid_lo); valid_path_mask = ~movemask128(pshufb_m128(expand_valid, data_select_mask)); } @@ -1332,7 +1332,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1346,7 +1346,7 @@ int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1393,7 +1393,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_double = set2x128(data_m128); + m256 data_double = set1_2x128(data_m128); m256 data_select_mask = loadu256(ri->data_select_mask); u32 valid_path_mask = 0; @@ -1407,7 +1407,7 @@ int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); valid_path_mask = ~movemask256(pshufb_m256(expand_valid, data_select_mask)); @@ -1460,7 +1460,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u32 valid_data_mask; m128 data_m128 = getData128(ci, offset, &valid_data_mask); - m256 data_m256 = set2x128(data_m128); + m256 data_m256 = set1_2x128(data_m128); m256 data_select_mask_1 = loadu256(ri->data_select_mask); m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); @@ -1475,7 +1475,7 @@ int roseCheckMultipathShufti64(const struct hs_scratch *scratch, u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); DEBUG_PRINTF("expand_hi %llx\n", valid_hi); DEBUG_PRINTF("expand_lo %llx\n", valid_lo); - expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + expand_valid = set4x64(valid_hi, valid_lo, valid_hi, valid_lo); u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, data_select_mask_1)); diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h index 1dc855d9..3b91f091 100644 --- a/src/rose/validate_shufti.h +++ b/src/rose/validate_shufti.h @@ -47,7 +47,7 @@ static really_inline int validateShuftiMask16x16(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -78,7 +78,7 @@ int validateShuftiMask16x8(const m128 data, const m256 nib_mask, const m128 and_mask, const u32 neg_mask, const u32 valid_data_mask) { m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 
low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 nresult = eq128(and128(t, and_mask), zeroes128()); @@ -101,7 +101,7 @@ static really_inline int validateShuftiMask32x8(const m256 data, const m256 hi_mask, const m256 lo_mask, const m256 and_mask, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); m256 c_hi = pshufb_m256(hi_mask, rshift64_m256(andnot256(low4bits, data), 4)); @@ -133,7 +133,7 @@ int validateShuftiMask32x16(const m256 data, const m256 bucket_mask_hi, const m256 bucket_mask_lo, const u32 neg_mask, const u32 valid_data_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -201,7 +201,7 @@ int validateMultipathShuftiMask16x8(const m128 data, const u32 neg_mask, const u32 valid_path_mask) { m256 data_256 = combine2x128(rshift64_m128(data, 4), data); - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); m128 result = and128(t, bucket_select_mask); @@ -220,7 +220,7 @@ int validateMultipathShuftiMask32x8(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo = pshufb_m256(lo_mask, data_lo); @@ -244,7 +244,7 @@ int validateMultipathShuftiMask32x16(const m256 data, const u32 hi_bits, const u32 lo_bits, const u32 neg_mask, const u32 valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 data_lo = and256(data, low4bits); m256 data_hi = and256(rshift64_m256(data, 4), low4bits); m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); @@ -271,7 +271,7 @@ int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2, const u64a hi_bits, const u64a lo_bits, const u64a neg_mask, const u64a valid_path_mask) { - m256 low4bits = set32x8(0xf); + m256 low4bits = set1_32x8(0xf); m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); m256 c_hi_1 = pshufb_m256(hi_mask, diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 7238849e..e6cf205c 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return _mm_set_epi32(x[3], x[2], x[1], x[0]); + return set32x4(x[3], x[2], x[1], x[0]); } #endif @@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a v[2]; @@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - return _mm_set_epi64x(x[1], x[0]); + return set2x64(x[1], x[0]); } #endif @@ -264,11 +264,11 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .hi = set32x4(x[7], x[6], x[5], x[4]) }; #else - m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + m256 xvec = set32x8(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -291,10 +291,10 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .hi = _mm_set_epi64x(x[3], x[2]) }; + m256 xvec = { .lo = set2x64(x[1], x[0]), + .hi = set2x64(x[3], x[2]) }; #else - m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + m256 xvec = set4x64(x[3], x[2], x[1], x[0]); #endif return xvec; } @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), - .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), - .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), + .mid = set32x4(x[7], x[6], x[5], x[4]), + .hi = set32x4(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { expand64(v[2], m[2]), expand64(v[3], m[3]), expand64(v[4], m[4]), expand64(v[5], m[5]) }; - m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), - .mid = _mm_set_epi64x(x[3], x[2]), - .hi = _mm_set_epi64x(x[5], x[4]) }; + m384 xvec = { .lo = set2x64(x[1], x[0]), + .mid = set2x64(x[3], x[2]), + .hi = set2x64(x[5], x[4]) }; return xvec; } #endif @@ -548,20 +548,20 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { m512 xvec; #if defined(HAVE_AVX512) - xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8], - x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); + xvec = set32x16(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], - x[3], x[2], x[1], x[0]); - xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], - x[11], x[10], x[9], x[8]); + xvec.lo = set32x8(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); + xvec.hi = set32x8(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -588,16 +588,16 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { 
expand64(v[6], m[6]), expand64(v[7], m[7]) }; #if defined(HAVE_AVX512) - m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + m512 xvec = set64x8(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + m512 xvec = { .lo = set4x64(x[3], x[2], x[1], x[0]), + .hi = set4x64(x[7], x[6], x[5], x[4])}; #else - m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), - _mm_set_epi64x(x[3], x[2]) }, - .hi = { _mm_set_epi64x(x[5], x[4]), - _mm_set_epi64x(x[7], x[6]) } }; + m512 xvec = { .lo = { set2x64(x[1], x[0]), + set2x64(x[3], x[2]) }, + .hi = { set2x64(x[5], x[4]), + set2x64(x[7], x[6]) } }; #endif return xvec; } From 04fbf2468140cc4d7ccabc62a2bdc4503a3d31c5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 23 Sep 2020 21:38:12 +0300 Subject: [PATCH 13/53] Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h" This reverts commit 6581aae90e55520353c03edb716de80ecc03521a. --- src/util/arch/common/popcount.h | 60 ----------------------------- src/util/arch/x86/popcount.h | 67 --------------------------------- src/util/popcount.h | 35 +++++++++++++---- 3 files changed, 27 insertions(+), 135 deletions(-) delete mode 100644 src/util/arch/common/popcount.h delete mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h deleted file mode 100644 index 0bd1e837..00000000 --- a/src/util/arch/common/popcount.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_COMMON_H -#define POPCOUNT_ARCH_COMMON_H - -static really_inline -u32 popcount32_impl_c(u32 x) { - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. 
- x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -} - -static really_inline -u32 popcount64_impl_c(u64a x) { -#if defined(ARCH_64_BIT) - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -#else - // Synthesise from two 32-bit cases. - return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h deleted file mode 100644 index 86929ede..00000000 --- a/src/util/arch/x86/popcount.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_X86_H -#define POPCOUNT_ARCH_X86_H - -#include "ue2common.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include "util/arch/common/popcount.h" - -static really_inline -u32 popcount32_impl(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - return popcount32_impl_c(x); -#endif -} - -static really_inline -u32 popcount64_impl(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - return popcount64_impl_c(x); -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index 932fc2cf..eb08f6b1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,22 +33,41 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ -#include "config.h" #include "ue2common.h" #include "util/arch.h" -#if defined(ARCH_IA32) || defined(ARCH_X86_64) -#include "util/arch/x86/popcount.h" -#endif - static really_inline u32 popcount32(u32 x) { - return popcount32_impl(x); +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif } static really_inline -u32 popcount64(u32 x) { - return popcount64_impl(x); +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif } #endif /* UTIL_POPCOUNT_H_ */ From f0e70bc0ad13d585d44115dd4e6c1f42ce5e446b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 24 Sep 2020 11:52:59 +0300 Subject: [PATCH 14/53] Revert "Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h"" This reverts commit 04fbf2468140cc4d7ccabc62a2bdc4503a3d31c5. --- src/util/arch/common/popcount.h | 60 +++++++++++++++++++++++++++++ src/util/arch/x86/popcount.h | 67 +++++++++++++++++++++++++++++++++ src/util/popcount.h | 35 ++++------------- 3 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 src/util/arch/common/popcount.h create mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h new file mode 100644 index 00000000..0bd1e837 --- /dev/null +++ b/src/util/arch/common/popcount.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_COMMON_H +#define POPCOUNT_ARCH_COMMON_H + +static really_inline +u32 popcount32_impl_c(u32 x) { + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +} + +static really_inline +u32 popcount64_impl_c(u64a x) { +#if defined(ARCH_64_BIT) + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h new file mode 100644 index 00000000..86929ede --- /dev/null +++ b/src/util/arch/x86/popcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef POPCOUNT_ARCH_X86_H +#define POPCOUNT_ARCH_X86_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/popcount.h" + +static really_inline +u32 popcount32_impl(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + return popcount32_impl_c(x); +#endif +} + +static really_inline +u32 popcount64_impl(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + return popcount64_impl_c(x); +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32_impl(x >> 32) + popcount32_impl(x); +#endif +} + +#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index eb08f6b1..932fc2cf 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,41 +33,22 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ +#include "config.h" #include "ue2common.h" #include "util/arch.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "util/arch/x86/popcount.h" +#endif + static really_inline u32 popcount32(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -#endif + return popcount32_impl(x); } static really_inline -u32 popcount64(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32(x >> 32) + popcount32(x); -#endif +u32 popcount64(u32 x) { + return popcount64_impl(x); } #endif /* UTIL_POPCOUNT_H_ */ From b1170bcc2e54b428ed0fa63802c0aced62b4b8c7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 08:09:18 +0300 Subject: [PATCH 15/53] add arm checks in platform.cmake --- cmake/platform.cmake | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 593c544b..8c82da2b 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,9 +1,15 @@ # determine the target arch # really only interested in the preprocessor here -CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) +CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -set(ARCH_X86_64 ${ARCH_64_BIT}) -set(ARCH_IA32 ${ARCH_32_BIT}) +CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_ARM64) +CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) + +if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_ARM64)) + set(ARCH_64_BIT TRUE) +else() + set(ARCH_32_BIT TRUE) +endif() From 5952c64066dc147b3a73024c572f416ba2d125cd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 12:44:23 +0300 Subject: [PATCH 16/53] add necessary modifications to CMake system to enable building on ARM, add arm_neon.h intrinsic header to intrinsics.h --- CMakeLists.txt | 14 ++++++++----- cmake/arch.cmake | 46 +++++++++++++++++++++++++++++-------------- cmake/config.h.in | 9 +++++++++ cmake/platform.cmake | 4 ++-- src/util/intrinsics.h | 6 ++++++ 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5078848..f4d1cc9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,7 +175,7 @@ else() string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () - if (CMAKE_COMPILER_IS_GNUCC) + if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC) message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") # If gcc doesn't recognise the host cpu, then mtune=native becomes # generic, which isn't very good in some cases. 
march=native looks at @@ -281,10 +281,14 @@ else() endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) -CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) -CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) -CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +if (ARCH_IA32 OR ARCH_X86_64) + CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) + CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) + CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) + CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) +elseif (ARCH_ARM32 OR ARCH_AARCH64) + CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) +endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index cced49c6..e3cc9f44 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H) set (INTRIN_INC_H "x86intrin.h") elseif (HAVE_C_INTRIN_H) set (INTRIN_INC_H "intrin.h") -else () +elseif (HAVE_C_ARM_NEON_H) + set (INTRIN_INC_H "arm_neon.h") + set (FAT_RUNTIME OFF) +else() message (FATAL_ERROR "No intrinsics header found") endif () @@ -29,15 +32,16 @@ else (NOT FAT_RUNTIME) set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") endif () -# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +if (ARCH_IA32 OR ARCH_X86_64) + # ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { __m128i a = _mm_set1_epi8(1); (void)_mm_shuffle_epi8(a, a); }" HAVE_SSSE3) -# now look for AVX2 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # now look for AVX2 + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX2__) #error no avx2 #endif @@ -47,8 +51,8 @@ int main(){ (void)_mm256_xor_si256(z, z); }" HAVE_AVX2) -# and now for AVX512 -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # and now for AVX512 + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX512BW__) #error no avx512bw #endif @@ -58,8 +62,8 @@ int main(){ (void)_mm512_abs_epi8(z); }" HAVE_AVX512) -# and now for AVX512VBMI -CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> + # and now for AVX512VBMI + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> #if !defined(__AVX512VBMI__) #error no avx512vbmi #endif @@ -70,26 +74,38 @@ int main(){ (void)_mm512_permutexvar_epi8(idx, a); }" HAVE_AVX512VBMI) +elseif (ARCH_ARM32 OR ARCH_AARCH64) + CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + int32x4_t a = vdupq_n_s32(1); +}" HAVE_NEON) +else () + message (FATAL_ERROR "Unsupported architecture") +endif () + if (FAT_RUNTIME) - if (NOT HAVE_SSSE3) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) message(FATAL_ERROR "SSSE3 support required to build fat runtime") endif () - if (NOT HAVE_AVX2) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) message(FATAL_ERROR "AVX2 support required to build fat runtime") endif () - if (BUILD_AVX512 AND NOT HAVE_AVX512) + if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512) message(FATAL_ERROR "AVX512 support requested but not supported") endif () else (NOT FAT_RUNTIME) - if (NOT HAVE_AVX2) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) message(STATUS "Building without AVX2 support") endif () - if (NOT HAVE_AVX512) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512) message(STATUS "Building without AVX512 support") endif () - 
if (NOT HAVE_SSSE3) + if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") endif () + if ((ARCH_ARM32 OR ARCH_AARCH64) AND NOT HAVE_NEON) + message(FATAL_ERROR "NEON support required for ARM support") + endif () endif () unset (CMAKE_REQUIRED_FLAGS) diff --git a/cmake/config.h.in b/cmake/config.h.in index 203f0afd..2d2c78ce 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -15,6 +15,12 @@ /* "Define if building for EM64T" */ #cmakedefine ARCH_X86_64 +/* "Define if building for ARM32" */ +#cmakedefine ARCH_ARM32 + +/* "Define if building for AARCH64" */ +#cmakedefine ARCH_AARCH64 + /* internal build, switch on dump support. */ #cmakedefine DUMP_SUPPORT @@ -45,6 +51,9 @@ /* C compiler has intrin.h */ #cmakedefine HAVE_C_INTRIN_H +/* C compiler has arm_neon.h */ +#cmakedefine HAVE_C_ARM_NEON_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 8c82da2b..4591bf93 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -5,10 +5,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_ARM64) +CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) -if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_ARM64)) +if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_AARCH64)) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index edc4f6ef..3e2afc22 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -45,6 +45,10 @@ # endif #endif +#if defined(HAVE_C_ARM_NEON_H) +# define USE_ARM_NEON_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -59,6 +63,8 @@ #include #elif defined(USE_INTRIN_H) #include +#elif defined(USE_ARM_NEON_H) +#include #else #error no intrinsics file #endif From e91082d477a659bfc6f100f2a7ffd029553d2f3e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 6 Oct 2020 13:45:52 +0300 Subject: [PATCH 17/53] use right intrinsic --- src/util/state_compress.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index e6cf205c..87eccce7 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), expand32(v[2], m[2]), expand32(v[3], m[3]) }; - return set32x4(x[3], x[2], x[1], x[0]); + return set4x32(x[3], x[2], x[1], x[0]); } #endif @@ -264,10 +264,10 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[6], m[6]), expand32(v[7], m[7]) }; #if !defined(HAVE_AVX2) - m256 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), - .hi = set32x4(x[7], x[6], x[5], x[4]) }; + m256 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .hi = set4x32(x[7], x[6], x[5], x[4]) }; #else - m256 xvec = set32x8(x[7], x[6], x[5], x[4], + m256 xvec = 
set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #endif return xvec; @@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { expand32(v[8], m[8]), expand32(v[9], m[9]), expand32(v[10], m[10]), expand32(v[11], m[11]) }; - m384 xvec = { .lo = set32x4(x[3], x[2], x[1], x[0]), - .mid = set32x4(x[7], x[6], x[5], x[4]), - .hi = set32x4(x[11], x[10], x[9], x[8]) }; + m384 xvec = { .lo = set4x32(x[3], x[2], x[1], x[0]), + .mid = set4x32(x[7], x[6], x[5], x[4]), + .hi = set4x32(x[11], x[10], x[9], x[8]) }; return xvec; } #endif @@ -553,15 +553,15 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); #elif defined(HAVE_AVX2) - xvec.lo = set32x8(x[7], x[6], x[5], x[4], + xvec.lo = set8x32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); - xvec.hi = set32x8(x[15], x[14], x[13], x[12], + xvec.hi = set8x32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]); #else - xvec.lo.lo = set32x4(x[3], x[2], x[1], x[0]); - xvec.lo.hi = set32x4(x[7], x[6], x[5], x[4]); - xvec.hi.lo = set32x4(x[11], x[10], x[9], x[8]); - xvec.hi.hi = set32x4(x[15], x[14], x[13], x[12]); + xvec.lo.lo = set4x32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = set4x32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = set4x32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = set4x32(x[15], x[14], x[13], x[12]); #endif return xvec; } From 9a0494259efbce2654da3b0b9f4978749383a715 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:26:41 +0300 Subject: [PATCH 18/53] minor fix --- src/util/arch/x86/simd_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/arch/x86/simd_types.h b/src/util/arch/x86/simd_types.h index a582abd5..d74493b4 100644 --- a/src/util/arch/x86/simd_types.h +++ b/src/util/arch/x86/simd_types.h @@ -41,5 +41,5 @@ typedef __m256i m256; typedef __m512i m512; #endif -#endif /* SIMD_TYPES_H */ +#endif /* SIMD_TYPES_X86_H */ From 4c924cc920ad4dce46e30a6e6fb40d0b59817787 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:28:12 +0300 Subject: [PATCH 19/53] add arm architecture basic defines --- src/util/arch.h | 6 ++++- src/util/arch/arm/arm.h | 42 ++++++++++++++++++++++++++++++++++ src/util/arch/arm/simd_types.h | 37 ++++++++++++++++++++++++++++++ src/util/simd_types.h | 4 +++- 4 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 src/util/arch/arm/arm.h create mode 100644 src/util/arch/arm/simd_types.h diff --git a/src/util/arch.h b/src/util/arch.h index 57e39c07..794f28f7 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -33,8 +33,12 @@ #ifndef UTIL_ARCH_H_ #define UTIL_ARCH_H_ -#if defined(__i386__) || defined(__x86_64__) +#include "config.h" + +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/x86.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/arm.h" #endif #endif // UTIL_ARCH_X86_H_ diff --git a/src/util/arch/arm/arm.h b/src/util/arch/arm/arm.h new file mode 100644 index 00000000..326e8f56 --- /dev/null +++ b/src/util/arch/arm/arm.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_ARM_H_ +#define UTIL_ARCH_ARM_H_ + +#if defined(__ARM_NEON) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) +#define HAVE_NEON +#define HAVE_SIMD_128_BITS +#endif + +#endif // UTIL_ARCH_ARM_H_ + diff --git a/src/util/arch/arm/simd_types.h b/src/util/arch/arm/simd_types.h new file mode 100644 index 00000000..cc4c50e4 --- /dev/null +++ b/src/util/arch/arm/simd_types.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_TYPES_ARM_H +#define SIMD_TYPES_ARM_H + +#if !defined(m128) && defined(HAVE_NEON) +typedef int32x4_t m128; +#endif + +#endif /* SIMD_TYPES_ARM_H */ + diff --git a/src/util/simd_types.h b/src/util/simd_types.h index a58ede4d..5777374b 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -34,8 +34,10 @@ #include "util/intrinsics.h" #include "ue2common.h" -#if defined(__i386__) || defined(__x86_64__) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/simd_types.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_types.h" #endif #if !defined(m128) && !defined(HAVE_SIMD_128_BITS) From 5d773dd9db21e2f753ce386bfcf53e69c5113abe Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 7 Oct 2020 14:28:45 +0300 Subject: [PATCH 20/53] use C implementation of popcount for arm --- src/util/arch/common/popcount.h | 4 ++-- src/util/popcount.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h index 0bd1e837..ef5776e8 100644 --- a/src/util/arch/common/popcount.h +++ b/src/util/arch/common/popcount.h @@ -53,8 +53,8 @@ u32 popcount64_impl_c(u64a x) { return (x * 0x0101010101010101) >> 56; #else // Synthesise from two 32-bit cases. - return popcount32_impl(x >> 32) + popcount32_impl(x); + return popcount32_impl_c(x >> 32) + popcount32_impl_c(x); #endif } -#endif // POPCOUNT_ARCH_COMMON_H \ No newline at end of file +#endif // POPCOUNT_ARCH_COMMON_H diff --git a/src/util/popcount.h b/src/util/popcount.h index 932fc2cf..5fd6dc33 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -39,6 +39,10 @@ #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/popcount.h" +#else +#include "util/arch/common/popcount.h" +#define popcount32_impl(x) popcount32_impl_c(x) +#define popcount64_impl(x) popcount64_impl_c(x) #endif static really_inline From d2cf1a7882d5f162fff756086bd2178a58c42cbc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:48:20 +0300 Subject: [PATCH 21/53] move cpuid_flags.h header to common --- CMakeLists.txt | 2 +- src/hs.cpp | 3 ++- src/util/arch/{x86 => common}/cpuid_flags.h | 2 +- src/util/target_info.cpp | 5 +++-- 4 files changed, 7 insertions(+), 5 deletions(-) rename src/util/arch/{x86 => common}/cpuid_flags.h (95%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d1cc9f..c1db4dfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -569,7 +569,7 @@ set (hs_exec_common_SRCS src/alloc.c src/scratch.c src/util/arch/x86/cpuid_flags.c - src/util/arch/x86/cpuid_flags.h + src/util/arch/common/cpuid_flags.h src/util/multibit.c ) diff --git a/src/hs.cpp b/src/hs.cpp index a0cb9bb3..7898cf46 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -44,9 +44,10 @@ #include "parser/prefilter.h" #include "parser/unsupported.h" #include "util/compile_error.h" +#include "util/arch/common/cpuid_flags.h" #if defined(ARCH_X86_64) -#include "util/arch/x86/cpuid_flags.h" #include "util/arch/x86/cpuid_inline.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #endif #include "util/depth.h" #include "util/popcount.h" diff --git a/src/util/arch/x86/cpuid_flags.h b/src/util/arch/common/cpuid_flags.h similarity index 95% rename from src/util/arch/x86/cpuid_flags.h rename to src/util/arch/common/cpuid_flags.h index 527c6d52..68e427dd 100644 --- a/src/util/arch/x86/cpuid_flags.h +++ b/src/util/arch/common/cpuid_flags.h @@ -31,7 +31,7 @@ #include "ue2common.h" -#if !defined(_WIN32) && 
!defined(CPUID_H_) +#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_) #include /* system header doesn't have a header guard */ #define CPUID_H_ diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 6eab701d..5253755b 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -29,8 +29,9 @@ #include "hs_compile.h" // for various hs_platform_info flags #include "target_info.h" -#if defined(ARCH_X86_64) -#include "util/arch/x86/cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #endif namespace ue2 { From 1c2c73becfa9ee26f2c468445d10e3ae638b0243 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:50:18 +0300 Subject: [PATCH 22/53] add C implementation of pdep64() --- src/util/arch/common/bitutils.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index f2706d70..e86b8d44 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -351,6 +351,36 @@ u64a pext64_impl_c(u64a x, u64a mask) { return result; } +static really_inline +u64a pdep64_impl_c(u64a x, u64a _m) { + /* Taken from: + * https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html + */ + + u64a result = 0x0UL; + const u64a mask = 0x8000000000000000UL; + u64a m = _m; + u64a c, t; + u64a p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl (_m); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) + { + c = __builtin_clzl (m); + t = x << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + /* compilers don't reliably synthesize the 32-bit ANDN instruction here, * so we force its generation. */ From a9212174eee2ffabc261b3323719c0a06640f83e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 8 Oct 2020 20:50:55 +0300 Subject: [PATCH 23/53] add arm bitutils.h header --- src/util/arch/arm/bitutils.h | 179 +++++++++++++++++++++++++++++++++++ src/util/bitutils.h | 2 + 2 files changed, 181 insertions(+) create mode 100644 src/util/arch/arm/bitutils.h diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h new file mode 100644 index 00000000..514ddc5c --- /dev/null +++ b/src/util/arch/arm/bitutils.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_ARM_H +#define BITUTILS_ARCH_ARM_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + u32 val = *v; + u32 offset = 31 - clz32_impl(val); + *v = val & ~(1 << offset); + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +static really_inline +u64a pdep64(u64a x, u64a mask) { + return pdep64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_ARM_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 7373a9c8..556ba818 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -47,6 +47,8 @@ #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/bitutils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/bitutils.h" #endif static really_inline From 31ac6718dd26f9b7b6e1319a7f55ae7be507a508 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 13 Oct 2020 09:19:56 +0300 Subject: [PATCH 24/53] add ARM version of simd_utils.h --- src/util/arch/arm/simd_utils.h | 288 +++++++++++++++++++++++++++++++++ src/util/simd_utils.h | 2 + 2 files changed, 290 insertions(+) create mode 100644 src/util/arch/arm/simd_utils.h diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h new file mode 100644 index 00000000..606892fb --- /dev/null +++ b/src/util/arch/arm/simd_utils.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_ARM_SIMD_UTILS_H +#define ARCH_ARM_SIMD_UTILS_H + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +static really_inline m128 ones128(void) { + return (m128) vdupq_n_s32(0xFF); +} + +static really_inline m128 zeroes128(void) { + return (m128) vdupq_n_s32(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return (m128) veorq_s32(a, a); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + m128 t = (m128)vceqq_s8((int8x16_t)a, (int8x16_t)b); + return (16 != vaddvq_u8((uint8x16_t)t)); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + return vaddvq_u32(vandq_u32(vceqq_s32((int32x4_t)a, (int32x4_t)b), movemask)); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. 
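+ *
+ * (Implementation note: the NEON variant below compares the 64-bit lanes with
+ * vceqq_s64, masks the per-lane results with the weights { 1, 2 } and folds
+ * them to a scalar with vaddvq_u64, collapsing the lane-wise compare results
+ * into an integer mask.)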
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const uint64x2_t movemask = { 1, 2 }; + return vaddvq_u64(vandq_u64(vceqq_s64((int64x2_t)a, (int64x2_t)b), movemask)); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s64((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s64((int64x2_t)a, b); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); +} + +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set1_16x8(u8 c) { + return (m128) vdupq_n_u8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return (m128) vdupq_n_u32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return (m128) vdupq_n_u64(c); +} + +static really_inline u32 movd(const m128 in) { + return vgetq_lane_u32((uint32x4_t) in, 0); +} + +static really_inline u64a movq(const m128 in) { + return vgetq_lane_u64((uint64x2_t) in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vdupq_n_u64(*p); +} + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s8((int8x16_t)a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s8((int8x16_t)a, b); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { + return vgetq_lane_u32((uint32x4_t) in, imm); +} + +static really_inline u32 extract64from128(const m128 in, unsigned imm) { + return vgetq_lane_u64((uint64x2_t) in, imm); +} + +static really_inline m128 and128(m128 a, m128 b) { + return (m128) vandq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return (m128) veorq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return (m128) vorrq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return (m128) vbicq_u32((uint32x4_t)a, (uint32x4_t)b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + vst1q_s32((int32_t *)ptr, a); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return (m128) vld1q_s32((const int32_t *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + vst1q_s32((int32_t *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. 
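+// A minimal usage sketch for the single-bit helpers below (the values are
+// hypothetical):
+//     m128 v = zeroes128();
+//     setbit128(&v, 5);
+//     assert(testbit128(v, 5));
+//     clearbit128(&v, 5);
+//     assert(!isnonzero128(v));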
+static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return (m128) vmaxq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return (m128) vminq_s8((int8x16_t)a, (int8x16_t)b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return (m128) vqaddq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return (m128) vsubq_u8((uint8x16_t)a, (uint8x16_t)b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t __attribute__((aligned(16))) data[4] = { x3, x2, x1, x0 }; + return (m128) vld1q_u32((uint32_t *) data); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t __attribute__((aligned(16))) data[2] = { hi, lo }; + return (m128) vld1q_u64((uint64_t *) data); +} + +#endif // ARCH_ARM_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 019dc125..49200288 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -63,6 +63,8 @@ extern const char vbs_mask_data[]; #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/simd_utils.h" +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#include "util/arch/arm/simd_utils.h" #endif #endif // SIMD_UTILS_H From 5b425bd5a6752d239ebe5957dc90bb22bfc37e2e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:25:29 +0300 Subject: [PATCH 25/53] add arm simple cpuid_flags --- CMakeLists.txt | 13 ++++++++++- src/hs_valid_platform.c | 5 +++++ src/util/arch/arm/cpuid_flags.c | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 src/util/arch/arm/cpuid_flags.c diff --git a/CMakeLists.txt b/CMakeLists.txt index c1db4dfa..566a7dcd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -568,11 +568,22 @@ install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c src/scratch.c - src/util/arch/x86/cpuid_flags.c src/util/arch/common/cpuid_flags.h src/util/multibit.c ) +if (ARCH_IA32 OR ARCH_X86_64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/x86/cpuid_flags.c + ) +elif (ARCH_ARM32 OR ARCH_AARCH64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/arm/cpuid_flags.c + ) +endif () + set (hs_exec_SRCS ${hs_HEADERS} src/hs_version.h diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 7a022607..b187090b 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -28,6 +28,7 @@ #include "config.h" #include 
"hs_common.h" +#include "ue2common.h" #if defined(ARCH_X86_64) #include "util/arch/x86/cpuid_inline.h" #endif @@ -35,9 +36,13 @@ HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ +#if defined(ARCH_IA32) || defined(ARCH_X86_64) if (check_ssse3()) { return HS_SUCCESS; } else { return HS_ARCH_ERROR; } +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) + return HS_SUCCESS; +#endif } diff --git a/src/util/arch/arm/cpuid_flags.c b/src/util/arch/arm/cpuid_flags.c new file mode 100644 index 00000000..8dbab473 --- /dev/null +++ b/src/util/arch/arm/cpuid_flags.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return cap; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} From c5a7f4b846edd8c6811fccae54a7df7ceabf52cf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:26:49 +0300 Subject: [PATCH 26/53] add ARM simd_utils vectorized functions for 128-bit vectors --- src/util/arch/arm/simd_utils.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 606892fb..74f447fb 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -95,7 +95,18 @@ static really_inline m128 eq128(m128 a, m128 b) { return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); } -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) +static really_inline u32 movemask128(m128 a) { + static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + // Compute the mask from the input + uint64x2_t mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8((uint8x16_t)a, powers)))); + + // Get the resulting bytes + uint16_t output; + vst1q_lane_u8((uint8_t*)&output + 0, (uint8x16_t)mask, 0); + vst1q_lane_u8((uint8_t*)&output + 1, (uint8x16_t)mask, 8); + return output; +} static really_inline m128 set1_16x8(u8 c) { return (m128) vdupq_n_u8(c); @@ -229,21 +240,22 @@ void clearbit128(m128 *ptr, unsigned int n) { static really_inline char testbit128(m128 val, unsigned int n) { const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else + return isnonzero128(and128(mask, val)); -#endif } -// offset must be an immediate -#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) +static really_inline +m128 palignr(m128 r, m128 l, int offset) { + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +} static really_inline m128 pshufb_m128(m128 a, m128 b) { - m128 result; - result = _mm_shuffle_epi8(a, b); - return result; + /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. + In NEON, if >=16, then the result is zero, otherwise it is that lane. + btranslated is the version that is converted from Intel to NEON. */ + int8x16_t btranslated = vandq_s8((int8x16_t)b,vdupq_n_s8(0x8f)); + return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); } static really_inline From 45bfed9b9d22e172b82659d07d63e0a2802b2fa4 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 15 Oct 2020 16:30:18 +0300 Subject: [PATCH 27/53] add scalar versions of the vectorized functions for architectures that don't support 256-bit/512-bit SIMD vectors such as ARM --- src/util/arch/common/simd_utils.h | 753 ++++++++++++++++++++++++++++++ src/util/simd_utils.h | 2 + 2 files changed, 755 insertions(+) create mode 100644 src/util/arch/common/simd_utils.h diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h new file mode 100644 index 00000000..e682e2d5 --- /dev/null +++ b/src/util/arch/common/simd_utils.h @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_COMMON_SIMD_UTILS_H +#define ARCH_COMMON_SIMD_UTILS_H + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +#if !defined(HAVE_SIMD_128_BITS) +#error "You need at least a 128-bit capable SIMD engine!" +#endif // HAVE_SIMD_128_BITS + +/**** + **** 256-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_256_BITS) + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline m256 set1_4x64(u64a c) { + m128 a128 = set1_2x64(c); + m256 rv = {a128, a128}; + return rv; +} + +static really_inline +m256 set1_2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} + +static really_inline m256 zeroes256(void) { + m256 rv = {zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m256 ones256(void) { + m256 rv = {ones128(), ones128()}; + return rv; +} + +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} + +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} + +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_inline int diff256(m256 a, m256 b) { + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +} + +static 
really_inline int isnonzero256(m256 a) { + return isnonzero128(or128(a.lo, a.hi)); +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { + return set1_2x128(load128(ptr)); +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set1_2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set1_32x8(u32 in) { + m256 rv; + rv.hi = set1_16x8(in); + rv.lo = set1_16x8(in); + return rv; +} + +static really_inline +m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { + m256 rv; + rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); + rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); + return rv; +} + +static really_inline +m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { + m256 rv; + rv.hi = set2x64(hi_1, hi_0); + rv.lo = set2x64(lo_1, lo_0); + return rv; +} + +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. 
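+// (In this fallback an m256 is a struct of two m128 halves, so bit N is read
+// from .lo when N < 128 and from .hi at offset N - 128 otherwise; for example
+// testbit256(v, 200) reduces to testbit128(v.hi, 72).)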
+static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) + +#endif // HAVE_SIMD_256_BITS + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). 
Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
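+// (An m384 is three m128 lanes (lo, mid and hi), so bit N is routed to lane
+// N / 128 and then tested with testbit128 at offset N % 128.)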
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + + +/**** + **** 512-bit Primitives + ****/ + +#if !defined(HAVE_SIMD_512_BITS) +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline +m512 zeroes512(void) { + m512 rv = {zeroes256(), zeroes256()}; + return rv; +} + +static really_inline +m512 ones512(void) { + m512 rv = {ones256(), ones256()}; + return rv; +} + +static really_inline +m512 set1_64x8(u8 a) { + m256 a256 = set1_32x8(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set1_8x64(u64a a) { + m256 a256 = set1_4x64(a); + m512 rv = {a256, a256}; + return rv; +} + +static really_inline +m512 set8x64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + m512 rv; + rv.lo = set4x64(lo_3, lo_2, lo_1, lo_0); + rv.hi = set4x64(hi_3, hi_2, hi_1, hi_0); + return rv; +} +/* +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set8x64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +}*/ + +static really_inline +m512 set1_4x128(m128 a) { + m256 a256 = set1_2x128(a); + m512 rv = {a256, a256}; + return rv; +} + + +static really_inline +m512 and512(m512 a, m512 b) { + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 or512(m512 a, m512 b) { + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 xor512(m512 a, m512 b) { + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +} + +static really_inline +m512 not512(m512 a) { + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +} + +static really_inline +m512 andnot512(m512 a, m512 b) { + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +} + +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} + +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +static really_inline +int diff512(m512 a, m512 b) { + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +} + +static really_inline +int isnonzero512(m512 a) { + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? 
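+    // (diffrich512() reports one bit per differing 32-bit word; OR-ing each
+    // adjacent pair of bits and masking with 0x55555555 folds that into one
+    // bit per 64-bit word.)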
+ u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +} + +/*static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { +}*/ + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +} + +// tests bit N in the given vector. 
+static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +} + +#endif // HAVE_SIMD_512_BITS + +#endif // ARCH_COMMON_SIMD_UTILS_H diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 49200288..0724c94e 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -67,4 +67,6 @@ extern const char vbs_mask_data[]; #include "util/arch/arm/simd_utils.h" #endif +#include "util/arch/common/simd_utils.h" + #endif // SIMD_UTILS_H From e7e1308d7f709e6e6665db9ef042b7e335714198 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:29:45 +0300 Subject: [PATCH 28/53] fix compilation paths for cpuid_flags for x86 --- CMakeLists.txt | 2 +- src/util/arch/x86/cpuid_flags.c | 2 +- src/util/arch/x86/cpuid_inline.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 566a7dcd..4077d396 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -577,7 +577,7 @@ set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/x86/cpuid_flags.c ) -elif (ARCH_ARM32 OR ARCH_AARCH64) +else (ARCH_ARM32 OR ARCH_AARCH64) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/arm/cpuid_flags.c diff --git a/src/util/arch/x86/cpuid_flags.c b/src/util/arch/x86/cpuid_flags.c index 0b529c0b..81c7e456 100644 --- a/src/util/arch/x86/cpuid_flags.c +++ b/src/util/arch/x86/cpuid_flags.c @@ -26,7 +26,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #include "cpuid_inline.h" #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags diff --git a/src/util/arch/x86/cpuid_inline.h b/src/util/arch/x86/cpuid_inline.h index b6768cc2..97f19aed 100644 --- a/src/util/arch/x86/cpuid_inline.h +++ b/src/util/arch/x86/cpuid_inline.h @@ -30,7 +30,7 @@ #define CPUID_INLINE_H_ #include "ue2common.h" -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #if !defined(_WIN32) && !defined(CPUID_H_) #include From 83977db7abfd871f3fb2a37ee8534f46aa4cd994 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:30:34 +0300 Subject: [PATCH 29/53] split arch-agnostic simd_utils.h functions into the common file --- src/util/arch/common/simd_utils.h | 48 +-- src/util/arch/x86/simd_utils.h | 644 ++---------------------------- 2 files changed, 48 insertions(+), 644 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index e682e2d5..56d9dbaf 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -147,10 +147,12 @@ static really_inline int isnonzero256(m256 a) { } /** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit * mask indicating which 32-bit words contain differences. 
*/ -static really_inline u32 diffrich256(m256 a, m256 b) { +static really_inline +u32 diffrich256(m256 a, m256 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 8); } /** @@ -311,26 +313,6 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) - #endif // HAVE_SIMD_256_BITS /**** @@ -402,13 +384,6 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { -} - /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and * returns a 12-bit mask indicating which 64-bit words contain differences. @@ -507,9 +482,6 @@ char testbit384(m384 val, unsigned int n) { ****/ #if !defined(HAVE_SIMD_512_BITS) -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) static really_inline m512 zeroes512(void) { @@ -608,12 +580,6 @@ m512 lshift64_m512(m512 a, unsigned b) { return rv; } -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - static really_inline int diff512(m512 a, m512 b) { return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); @@ -621,9 +587,9 @@ int diff512(m512 a, m512 b) { static really_inline int isnonzero512(m512 a) { - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); + m256 x = or256(a.lo, a.lo); + m256 y = or256(a.hi, a.hi); + return isnonzero256(or256(x, y)); } /** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 2d099f56..4a1a691e 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -127,22 +127,8 @@ static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. 
- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} -#endif - static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif } /* another form of movq */ @@ -281,36 +267,6 @@ m128 pshufb_m128(m128 a, m128 b) { return result; } -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); @@ -352,7 +308,12 @@ m128 set2x64(u64a hi, u64a lo) { **** 256-bit Primitives ****/ -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2) + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + return _mm256_shuffle_epi8(a, b); +} static really_really_inline m256 lshift64_m256(m256 a, unsigned b) { @@ -379,143 +340,41 @@ m256 set1_2x128(m128 a) { return _mm256_broadcastsi128_si256(a); } -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set1_2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif } static really_inline m256 ones256(void) { -#if defined(HAVE_AVX2) m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif return rv; } -#if defined(HAVE_AVX2) static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 not256(m256 a) { return 
_mm256_xor_si256(a, ones256()); } -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif } static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif } /** @@ -523,16 +382,8 @@ static really_inline int isnonzero256(m256 a) { * mask indicating which 32-bit words contain differences. */ static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) a = _mm256_cmpeq_epi32(a, b); return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif } /** @@ -547,24 +398,12 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif } // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) return set1_2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif } static really_inline m256 loadu2x128(const void *ptr) { @@ -574,32 +413,17 @@ static really_inline m256 loadu2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif } // unaligned load static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif } // unaligned store static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif } // packed unaligned store of first N bytes @@ -628,101 +452,19 @@ m256 mask1bit256(unsigned int n) { static really_inline m256 set1_32x8(u32 in) { -#if defined(HAVE_AVX2) return _mm256_set1_epi8(in); -#else - m256 rv; - rv.hi = set1_16x8(in); - rv.lo = set1_16x8(in); - return rv; -#endif } static really_inline m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { -#if defined(HAVE_AVX2) return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); -#else - m256 rv; - rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); - rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); - return rv; -#endif } static really_inline m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { 
-#if defined(HAVE_AVX2) return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set2x64(hi_1, hi_0); - rv.lo = set2x64(lo_1, lo_0); - return rv; -#endif } -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { @@ -775,88 +517,12 @@ m256 combine2x128(m128 hi, m128 lo) { } #endif //AVX2 -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - +#if defined(HAVE_SIMD_128_BITS) /** * "Rich" version of diff384(). 
Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. */ + static really_inline u32 diffrich384(m384 a, m384 b) { m128 z = zeroes128(); a.lo = _mm_cmpeq_epi32(a.lo, b.lo); @@ -867,102 +533,42 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return ~(_mm_movemask_epi8(packed)) & 0xfff; } -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. -static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} +#endif // HAVE_SIMD_128_BITS /**** **** 512-bit Primitives ****/ +#if defined(HAVE_SIMD_512_BITS) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. 
+ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + #define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) #define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) @@ -978,16 +584,10 @@ m512 zeroes512(void) { static really_inline m512 ones512(void) { -#if defined(HAVE_AVX512) return _mm512_set1_epi8(0xFF); //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 set1_64x8(u8 a) { return _mm512_set1_epi8(a); @@ -1015,69 +615,32 @@ static really_inline m512 set1_4x128(m128 a) { return _mm512_broadcast_i32x4(a); } -#endif static really_inline m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 not512(m512 a) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif } static really_inline m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif } -#if defined(HAVE_AVX512) static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { #if defined(HAVE__BUILTIN_CONSTANT_P) @@ -1088,21 +651,10 @@ m512 lshift64_m512(m512 a, unsigned b) { m128 x = _mm_cvtsi32_si128(b); return _mm512_sll_epi64(a, x); } -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif -#if defined(HAVE_AVX512) #define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) #define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) #define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif #if !defined(_MM_CMPINT_NE) #define _MM_CMPINT_NE 0x4 @@ -1110,25 +662,12 @@ m512 lshift64_m512(m512 a, unsigned b) { static really_inline int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -#endif } static really_inline int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif } /** @@ -1137,19 +676,7 @@ int isnonzero512(m512 a) { */ static really_inline u32 diffrich512(m512 
a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif } /** @@ -1166,43 +693,22 @@ u32 diffrich64_512(m512 a, m512 b) { // aligned load static really_inline m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif } // aligned store static really_inline void store512(void *ptr, m512 a) { assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif } // unaligned load static really_inline m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { return _mm512_maskz_loadu_epi8(k, ptr); @@ -1217,7 +723,6 @@ static really_inline m512 set_mask_m512(__mmask64 k) { return _mm512_movm_epi8(k); } -#endif // packed unaligned store of first N bytes static really_inline @@ -1247,91 +752,24 @@ m512 mask1bit512(unsigned int n) { static really_inline void setbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif } // switches off bit N in the given vector. static really_inline void clearbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif } // tests bit N in the given vector. 
static really_inline char testbit512(m512 val, unsigned int n) { assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) const m512 mask = mask1bit512(n); return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif } +#endif // HAVE_SIMD_512_BITS + #endif // ARCH_X86_SIMD_UTILS_H From 4bce012570ee4606528bf67561c0a49c0c3389e3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 12:32:44 +0300 Subject: [PATCH 30/53] Revert "move x86 popcount.h implementations to util/arch/x86/popcount.h" This reverts commit 6581aae90e55520353c03edb716de80ecc03521a. --- src/util/arch/common/popcount.h | 60 ----------------------------- src/util/arch/x86/popcount.h | 67 --------------------------------- src/util/popcount.h | 39 +++++++++++++------ 3 files changed, 27 insertions(+), 139 deletions(-) delete mode 100644 src/util/arch/common/popcount.h delete mode 100644 src/util/arch/x86/popcount.h diff --git a/src/util/arch/common/popcount.h b/src/util/arch/common/popcount.h deleted file mode 100644 index ef5776e8..00000000 --- a/src/util/arch/common/popcount.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_COMMON_H -#define POPCOUNT_ARCH_COMMON_H - -static really_inline -u32 popcount32_impl_c(u32 x) { - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. 
- x -= (x >> 1) & 0x55555555; - x = (x & 0x33333333) + ((x >> 2) & 0x33333333); - return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; -} - -static really_inline -u32 popcount64_impl_c(u64a x) { -#if defined(ARCH_64_BIT) - // Fast branch-free version from bit-twiddling hacks as older Intel - // processors do not have a POPCNT instruction. - x -= (x >> 1) & 0x5555555555555555; - x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); - x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (x * 0x0101010101010101) >> 56; -#else - // Synthesise from two 32-bit cases. - return popcount32_impl_c(x >> 32) + popcount32_impl_c(x); -#endif -} - -#endif // POPCOUNT_ARCH_COMMON_H diff --git a/src/util/arch/x86/popcount.h b/src/util/arch/x86/popcount.h deleted file mode 100644 index 86929ede..00000000 --- a/src/util/arch/x86/popcount.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Platform specific popcount functions - */ - -#ifndef POPCOUNT_ARCH_X86_H -#define POPCOUNT_ARCH_X86_H - -#include "ue2common.h" -#include "util/arch.h" -#include "util/intrinsics.h" - -#include "util/arch/common/popcount.h" - -static really_inline -u32 popcount32_impl(u32 x) { -#if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return _mm_popcnt_u32(x); -#else - return popcount32_impl_c(x); -#endif -} - -static really_inline -u32 popcount64_impl(u64a x) { -#if defined(ARCH_X86_64) -# if defined(HAVE_POPCOUNT_INSTR) - // Single-instruction builtin. - return (u32)_mm_popcnt_u64(x); -# else - return popcount64_impl_c(x); -# endif -#else - // Synthesise from two 32-bit cases. 
- return popcount32_impl(x >> 32) + popcount32_impl(x); -#endif -} - -#endif // POPCOUNT_ARCH_X86_h \ No newline at end of file diff --git a/src/util/popcount.h b/src/util/popcount.h index 5fd6dc33..eb08f6b1 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -33,26 +33,41 @@ #ifndef UTIL_POPCOUNT_H_ #define UTIL_POPCOUNT_H_ -#include "config.h" #include "ue2common.h" #include "util/arch.h" -#if defined(ARCH_IA32) || defined(ARCH_X86_64) -#include "util/arch/x86/popcount.h" -#else -#include "util/arch/common/popcount.h" -#define popcount32_impl(x) popcount32_impl_c(x) -#define popcount64_impl(x) popcount64_impl_c(x) -#endif - static really_inline u32 popcount32(u32 x) { - return popcount32_impl(x); +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif } static really_inline -u32 popcount64(u32 x) { - return popcount64_impl(x); +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif } #endif /* UTIL_POPCOUNT_H_ */ From c4db63665ad98115948f6c327f6f9952ecb49dd2 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 13:02:40 +0300 Subject: [PATCH 31/53] scalar implementations of diffrich256 and diffrich384 --- src/util/arch/arm/cpuid_flags.c | 4 ++-- src/util/arch/common/simd_utils.h | 11 ++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/cpuid_flags.c b/src/util/arch/arm/cpuid_flags.c index 8dbab473..1ba1a497 100644 --- a/src/util/arch/arm/cpuid_flags.c +++ b/src/util/arch/arm/cpuid_flags.c @@ -26,13 +26,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "cpuid_flags.h" +#include "util/arch/common/cpuid_flags.h" #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags #include "util/arch.h" u64a cpuid_flags(void) { - return cap; + return 0; } u32 cpuid_tune(void) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 56d9dbaf..25cd03cc 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -152,7 +152,7 @@ static really_inline int isnonzero256(m256 a) { */ static really_inline u32 diffrich256(m256 a, m256 b) { - return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 8); + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 4); } /** @@ -384,6 +384,15 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. 
+ */ +static really_inline +u32 diffrich384(m384 a, m384 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); +} + /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and * returns a 12-bit mask indicating which 64-bit words contain differences. From 149ea938c4412611f555c0c88af02666d7ccea23 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 16 Oct 2020 13:09:08 +0300 Subject: [PATCH 32/53] don't redefine function on x86 --- src/util/arch/common/simd_utils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 25cd03cc..c16023ac 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -384,6 +384,7 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } +#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64) /** * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. @@ -392,6 +393,7 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); } +#endif /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and From 0bef151437dcabce2b5541d7746c59286ce1a6d3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:38:05 +0200 Subject: [PATCH 33/53] don't use SSE directly in the tests --- unit/internal/simd_utils.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 623c2c99..5c0e0b40 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -658,34 +658,41 @@ TEST(SimdUtilsTest, movq) { char cmp[sizeof(m128)]; memset(cmp, 0x80, sizeof(m128)); - simd = set16x8(0x80); + simd = set1_16x8(0x80); r = movq(simd); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r))); +#if defined(HAVE_SIMD_128_BITS) +#if defined(ARCH_IA32) || defined(ARCH_X86_64) simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); +#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) + int64x2_t a = { ~0LL, 0x123456789abcdefLL }; + simd = vreinterpretq_s64_s8(a); +#endif +#endif r = movq(simd); ASSERT_EQ(r, 0x123456789abcdef); } -TEST(SimdUtilsTest, set16x8) { +TEST(SimdUtilsTest, set1_16x8) { char cmp[sizeof(m128)]; for (unsigned i = 0; i < 256; i++) { - m128 simd = set16x8(i); + m128 simd = set1_16x8(i); memset(cmp, i, sizeof(simd)); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } } -TEST(SimdUtilsTest, set4x32) { +TEST(SimdUtilsTest, set1_4x32) { u32 cmp[4] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 }; - m128 simd = set4x32(cmp[0]); + m128 simd = set1_4x32(cmp[0]); ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) TEST(SimdUtilsTest, set32x8) { char cmp[sizeof(m256)]; From 548242981d46ff30798b7cd567dc9bab0c296f77 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:38:41 +0200 Subject: [PATCH 34/53] fix ARM implementations --- src/util/arch/arm/simd_utils.h | 59 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 74f447fb..bfcb9bfe 100644 --- a/src/util/arch/arm/simd_utils.h +++ 
b/src/util/arch/arm/simd_utils.h @@ -33,6 +33,8 @@ #ifndef ARCH_ARM_SIMD_UTILS_H #define ARCH_ARM_SIMD_UTILS_H +#include + #include "ue2common.h" #include "util/simd_types.h" #include "util/unaligned.h" @@ -41,7 +43,7 @@ #include // for memcpy static really_inline m128 ones128(void) { - return (m128) vdupq_n_s32(0xFF); + return (m128) vdupq_n_s8(0xFF); } static really_inline m128 zeroes128(void) { @@ -50,13 +52,13 @@ static really_inline m128 zeroes128(void) { /** \brief Bitwise not for m128*/ static really_inline m128 not128(m128 a) { - return (m128) veorq_s32(a, a); + return (m128) vmvnq_s32(a); } /** \brief Return 1 if a and b are different otherwise 0 */ static really_inline int diff128(m128 a, m128 b) { - m128 t = (m128)vceqq_s8((int8x16_t)a, (int8x16_t)b); - return (16 != vaddvq_u8((uint8x16_t)t)); + int res = vaddvq_s8((int8x16_t) vceqq_s32(a, b)); + return (-16 != res); } static really_inline int isnonzero128(m128 a) { @@ -69,7 +71,7 @@ static really_inline int isnonzero128(m128 a) { */ static really_inline u32 diffrich128(m128 a, m128 b) { static const uint32x4_t movemask = { 1, 2, 4, 8 }; - return vaddvq_u32(vandq_u32(vceqq_s32((int32x4_t)a, (int32x4_t)b), movemask)); + return vaddvq_u32(vandq_u32(vmvnq_s32(vceqq_s32((int32x4_t)a, (int32x4_t)b)), movemask)); } /** @@ -77,8 +79,8 @@ static really_inline u32 diffrich128(m128 a, m128 b) { * returns a 4-bit mask indicating which 64-bit words contain differences. */ static really_inline u32 diffrich64_128(m128 a, m128 b) { - static const uint64x2_t movemask = { 1, 2 }; - return vaddvq_u64(vandq_u64(vceqq_s64((int64x2_t)a, (int64x2_t)b), movemask)); + static const uint64x2_t movemask = { 1, 4 }; + return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask)); } static really_really_inline @@ -125,7 +127,7 @@ static really_inline u32 movd(const m128 in) { } static really_inline u64a movq(const m128 in) { - return vgetq_lane_u64((uint64x2_t) in, 0); + return vgetq_lane_u64((uint64x2_t) in, 1); } /* another form of movq */ @@ -134,16 +136,6 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vdupq_n_u64(*p); } -static really_really_inline -m128 rshiftbyte_m128(m128 a, unsigned b) { - return (m128) vshrq_n_s8((int8x16_t)a, b); -} - -static really_really_inline -m128 lshiftbyte_m128(m128 a, unsigned b) { - return (m128) vshlq_n_s8((int8x16_t)a, b); -} - static really_inline u32 extract32from128(const m128 in, unsigned imm) { return vgetq_lane_u32((uint32x4_t) in, imm); } @@ -165,7 +157,7 @@ static really_inline m128 or128(m128 a, m128 b) { } static really_inline m128 andnot128(m128 a, m128 b) { - return (m128) vbicq_u32((uint32x4_t)a, (uint32x4_t)b); + return (m128) (m128) vandq_s8( vmvnq_s8(a), b); } // aligned load @@ -208,6 +200,24 @@ m128 loadbytes128(const void *ptr, unsigned int n) { return a; } +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return vqtbl1q_s8(in, shift_mask); +} + +static really_really_inline +m128 rshiftbyte_m128(m128 a, unsigned b) { + return variable_byte_shift_m128(a, -b);; +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + return variable_byte_shift_m128(a, b);; +} + + #ifdef __cplusplus extern "C" { #endif @@ -258,21 +268,14 @@ m128 pshufb_m128(m128 a, m128 b) { return (m128)vqtbl1q_s8((int8x16_t)a, (uint8x16_t)btranslated); } -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= 
-16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -} - static really_inline m128 max_u8_m128(m128 a, m128 b) { - return (m128) vmaxq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vmaxq_u8((int8x16_t)a, (int8x16_t)b); } static really_inline m128 min_u8_m128(m128 a, m128 b) { - return (m128) vminq_s8((int8x16_t)a, (int8x16_t)b); + return (m128) vminq_u8((int8x16_t)a, (int8x16_t)b); } static really_inline From 547f79b920771614d27e790e1e68221a8ab5c69f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:49:50 +0200 Subject: [PATCH 35/53] small optimization in storecompress*() --- src/util/state_compress.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 87eccce7..fa07eb2b 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -108,10 +108,10 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a x[2]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[2]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); // Count the number of bits of compressed state we're writing out per // chunk. @@ -215,10 +215,10 @@ void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) { static really_really_inline void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) { // First, decompose our vectors into 64-bit chunks. - u64a x[4]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[4]; - memcpy(m, &mvec, sizeof(mvec)); + u64a ALIGN_ATTR(32) x[4]; + u64a ALIGN_ATTR(32) m[4]; + store256(x, xvec); + store256(m, mvec); // Count the number of bits of compressed state we're writing out per // chunk. 
From 592b1905afdf175e124c5a1bd1282df718e559c6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Fri, 30 Oct 2020 10:50:24 +0200 Subject: [PATCH 36/53] needed for ARM vector type conversions --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4077d396..55954384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,6 +288,8 @@ if (ARCH_IA32 OR ARCH_X86_64) CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) elseif (ARCH_ARM32 OR ARCH_AARCH64) CHECK_INCLUDE_FILE_CXX(arm_neon.h HAVE_C_ARM_NEON_H) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flax-vector-conversions") endif() CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) From 18296eee4715f8c03ddb3935441c0ea11d08b450 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 17:31:20 +0200 Subject: [PATCH 37/53] fix 32-bit/64-bit detection --- cmake/platform.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 4591bf93..479b3680 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -5,10 +5,10 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error n CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) -CHECK_C_SOURCE_COMPILES("#if !defined(__aarch64__)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) -CHECK_C_SOURCE_COMPILES("#if !(defined(__arm__) && !defined(__aarch64__))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) +CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) -if (DEFINED(ARCH_X86_64) OR DEFINED(ARCH_AARCH64)) +if (ARCH_X86_64 OR ARCH_AARCH64) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) From 7b8cf9754638e963d20f0e1ee32b97a9de596d0c Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:18:53 +0200 Subject: [PATCH 38/53] add extra instructions (currently arm-only), fix order of elements in set4x32/set2x64 --- src/util/arch/arm/simd_utils.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index bfcb9bfe..7c5d11d5 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -83,6 +83,26 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { return vaddvq_u64(vandq_u64(vmvnq_s32(vceqq_s64((int64x2_t)a, (int64x2_t)b)), movemask)); } +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) vaddq_u64((int64x2_t)a, (int64x2_t)b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) vsubq_u64((int64x2_t)a, (int64x2_t)b); +} + +static really_really_inline +m128 lshift_m128(m128 a, unsigned b) { + return (m128) vshlq_n_s32((int64x2_t)a, b); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { + return (m128) vshrq_n_s32((int64x2_t)a, b); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { return (m128) vshlq_n_s64((int64x2_t)a, b); @@ -97,6 +117,10 @@ static really_inline m128 eq128(m128 a, m128 b) { return (m128) vceqq_s8((int8x16_t)a, (int8x16_t)b); } +static really_inline m128 
eq64_m128(m128 a, m128 b) { + return (m128) vceqq_u64((int64x2_t)a, (int64x2_t)b); +} + static really_inline u32 movemask128(m128 a) { static const uint8x16_t powers = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; @@ -290,13 +314,13 @@ m128 sub_u8_m128(m128 a, m128 b) { static really_inline m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { - uint32_t __attribute__((aligned(16))) data[4] = { x3, x2, x1, x0 }; + uint32_t __attribute__((aligned(16))) data[4] = { x0, x1, x2, x3 }; return (m128) vld1q_u32((uint32_t *) data); } static really_inline m128 set2x64(u64a hi, u64a lo) { - uint64_t __attribute__((aligned(16))) data[2] = { hi, lo }; + uint64_t __attribute__((aligned(16))) data[2] = { lo, hi }; return (m128) vld1q_u64((uint64_t *) data); } From 33904180d87390b7f67d0c429bc8ac6255b6d97e Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:20:06 +0200 Subject: [PATCH 39/53] add compress128 function and implementation --- src/util/arch/arm/bitutils.h | 102 ++++++++++++++++++++++++++++++++ src/util/arch/common/bitutils.h | 34 +++++++++-- src/util/arch/x86/bitutils.h | 5 ++ src/util/bitutils.h | 5 ++ 4 files changed, 142 insertions(+), 4 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 514ddc5c..0b579dc9 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -104,6 +104,108 @@ u64a compress64_impl(u64a x, u64a m) { return compress64_impl_c(x, m); } +static really_inline +m128 compress128_impl(m128 x, m128 m) { + +/* x = and128(x, m); // clear irrelevant bits + + // Return zero quickly on trivial cases + if (diff128(x, zeroes128()) == 0) { + return zeroes128(); + }*/ + + + u64a ALIGN_ATTR(16) xv[2]; + u64a ALIGN_ATTR(16) mv[2]; + u64a ALIGN_ATTR(16) res[2]; + u64a ALIGN_ATTR(16) t[2]; + u64a ALIGN_ATTR(16) bbv[2]; + store128(xv, x); + store128(mv, m); + res[0] = 0; + res[1] = 0; + printf("x[%d] = %0llx\n", 0, xv[0]); + printf("x[%d] = %0llx\n", 1, xv[1]); + + m128 one = set1_2x64(1); + m128 bitset = one; + m128 vres = zeroes128(); + for (u64a bb = 1; mv[0] | mv[1]; bb <<= 1) { + printf("bb = %lld\n", bb); + store128(bbv, bitset); + printf("bb[%d] = %0lld\n", 0, bbv[0]); + printf("bb[%d] = %0lld\n", 1, bbv[1]); + printf("m[%d] = %0llx\n", 0, mv[0]); + printf("m[%d] = %0llx\n", 1, mv[1]); + printf("scalar: -m[%d] = %0llx\n", 0, -mv[0]); + printf("scalar: -m[%d] = %0llx\n", 1, -mv[1]); + m128 mm = sub_2x64(zeroes128(), m); + store128(t, mm); + printf("vector: -m[0] = %0llx\n", t[0]); + printf("vector: -m[1] = %0llx\n", t[1]); + m128 tv = and128(x, m); + store128(t, tv); + printf("vector: x[0] & m[0] = %0llx\n", t[0]); + printf("vector: x[1] & m[1] = %0llx\n", t[1]); + tv = and128(tv, mm); + store128(t, tv); + printf("vector: x[0] & m[0] & -m[0] = %0llx\n", t[0]); + printf("vector: x[1] & m[1] & -m[1] = %0llx\n", t[1]); + t[0] = xv[0] & mv[0]; + t[1] = xv[1] & mv[1]; + printf("scalar: x[0] & m[0] = %0llx\n", t[0]); + printf("scalar: x[1] & m[1] = %0llx\n", t[1]); + t[0] = xv[0] & mv[0] & -mv[0]; + t[1] = xv[1] & mv[1] & -mv[1]; + printf("scalar: x[0] & m[0] & -m[0] = %0llx\n", t[0]); + printf("scalar: x[1] & m[1] & -m[1] = %0llx\n", t[1]); + + if ( t[0] ) { + printf("x & m & -m != 0\n"); + res[0] |= bb; + printf("x[%d] = %0llx\n", 0, xv[0]); + } + if ( t[1] ) { + printf("x & m & -m != 0\n"); + res[1] |= bb; + printf("x[%d] = %0llx\n", 1, xv[1]); + } + + m128 mask = not128(eq64_m128(tv, zeroes128())); + store128(t, mask); + printf("mask: x[0] & m[0] & -m[0] != 0 : %0llx\n", t[0]); + 
printf("mask: x[1] & m[1] & -m[1] != 0 : %0llx\n", t[1]); + + mask = vandq_s64(bitset, mask); + store128(t, mask); + printf("mask: mask[0] & bitset[1] != 0 : %0llx\n", t[0]); + printf("mask: mask[1] & bitset[1] != 0 : %0llx\n", t[1]); + + vres = or128(vres, mask); + store128(t, vres); + printf("res: res[0] != 0 : %0llx\n", t[0]); + printf("res: res[1] != 0 : %0llx\n", t[1]); + if (t[0] != res[0]) { + printf("mismatch: t[0] != res[0]: %0llx != %0llx\n", t[0], res[0]); + } + if (t[1] != res[1]) { + printf("mismatch: t[1] != res[1]: %0llx != %0llx\n", t[1], res[1]); + } + + mv[0] &= mv[0] - 1; + mv[1] &= mv[1] - 1; + m = and128(m, sub_2x64(m, set1_2x64(1))); + printf("x[%d] = %0llx\n", 0, xv[0]); + printf("x[%d] = %0llx\n", 1, xv[1]); + bitset = lshift64_m128(bitset, 1); + } + store128(res, vres); + printf("final x[%d] = %0llx\n", 0, res[0]); + printf("final x[%d] = %0llx\n", 1, res[1]); +// x = load128(res); + return vres; +} + static really_inline u32 expand32_impl(u32 x, u32 m) { return expand32_impl_c(x, m); diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h index e86b8d44..88e71bba 100644 --- a/src/util/arch/common/bitutils.h +++ b/src/util/arch/common/bitutils.h @@ -35,6 +35,7 @@ #include "util/popcount.h" #include "util/unaligned.h" +#include "util/simd_utils.h" static really_inline u32 clz32_impl_c(u32 x) { @@ -177,7 +178,13 @@ u32 compress32_impl_c(u32 x, u32 m) { static really_inline u64a compress64_impl_c(u64a x, u64a m) { - // Return zero quickly on trivial cases + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & m & -m) { res |= bb; } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases if ((x & m) == 0) { return 0; } @@ -202,7 +209,20 @@ u64a compress64_impl_c(u64a x, u64a m) { mk = mk & ~mp; } - return x; + return x;*/ +} + +static really_inline +m128 compress128_impl_c(m128 xvec, m128 mvec) { + u64a ALIGN_ATTR(16) x[2]; + u64a ALIGN_ATTR(16) m[2]; + store128(x, xvec); + store128(m, mvec); + + compress64_impl_c(x[0], m[0]); + compress64_impl_c(x[1], m[1]); + + return xvec; } static really_inline @@ -242,7 +262,13 @@ u32 expand32_impl_c(u32 x, u32 m) { static really_inline u64a expand64_impl_c(u64a x, u64a m) { - // Return zero quickly on trivial cases + u64a res = 0; + for (u64a bb = 1; m != 0; bb += bb) { + if (x & bb) { res |= m & (-m); } + m &= (m - 1); + } + return res; +/* // Return zero quickly on trivial cases if (!x || !m) { return 0; } @@ -272,7 +298,7 @@ u64a expand64_impl_c(u64a x, u64a m) { x = (x & ~mv) | (t & mv); } - return x & m0; // clear out extraneous bits + return x & m0; // clear out extraneous bits*/ } diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index ec4c95ad..a0769a5e 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -214,6 +214,11 @@ u64a compress64_impl(u64a x, u64a m) { #endif } +static really_inline +u64a compress128_impl(m128 x, m128 m) { + compress128_impl_c(x, m); +} + static really_inline u32 expand32_impl(u32 x, u32 m) { #if defined(HAVE_BMI2) diff --git a/src/util/bitutils.h b/src/util/bitutils.h index 556ba818..21d35388 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -120,6 +120,11 @@ u64a compress64(u64a x, u64a m) { return compress64_impl(x, m); } +static really_inline +m128 compress128(m128 x, m128 m) { + return compress128_impl(x, m); +} + static really_inline u32 expand32(u32 x, u32 m) { return expand32_impl(x, m); From 501f60e930f57f14010ca776677f4588e1f3362c Mon Sep 17 00:00:00 2001 From: 
Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:20:37 +0200 Subject: [PATCH 40/53] add some debug info --- src/util/state_compress.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index fa07eb2b..586e47f4 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -107,21 +107,29 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { + printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. +/* u64a x[2]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[2]; + memcpy(m, &mvec, sizeof(mvec));*/ u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; - store128(x, xvec); store128(m, mvec); + store128(x, xvec); // Count the number of bits of compressed state we're writing out per // chunk. - u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + //m128 vbits = load128(bits); // Compress each 64-bit chunk individually. - u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + //u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + xvec = compress128(xvec, mvec); + store128(x, xvec); // Write packed data out. - pack_bits_64(ptr, v, bits, 2); + pack_bits_64(ptr, x, bits, 2); } #endif @@ -157,15 +165,33 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + printf("loadcompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. - u64a m[2] = { movq(mvec), movq(rshiftbyte_m128(mvec, 8)) }; + u64a ALIGN_ATTR(16) m[2]; + store128(m, mvec); + printf("m[0] = %0llx\n", m[0]); + printf("m[1] = %0llx\n", m[1]); + +// m[0] = movq(mvec); +// m[1] = movq(rshiftbyte_m128(mvec, 8)); + //store128(m, mvec); +// printf("m[0] = %0llx\n", m[0]); +// printf("m[1] = %0llx\n", m[1]); u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - u64a v[2]; + u64a ALIGN_ATTR(16) v[2]; + + printf("bits[0] = %0x\n", bits[0]); + printf("bits[1] = %0x\n", bits[1]); unpack_bits_64(v, (const u8 *)ptr, bits, 2); + printf("v[0] = %0llx\n", v[0]); + printf("v[1] = %0llx\n", v[1]); u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + printf("x[0] = %0llx\n", x[0]); + printf("x[1] = %0llx\n", x[1]); + return set2x64(x[1], x[0]); } From 62fed20ad051848c39d735900b978ffe261a51d3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 19:21:16 +0200 Subject: [PATCH 41/53] add some debug and minor optimizations in unit test --- unit/internal/state_compress.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/unit/internal/state_compress.cpp b/unit/internal/state_compress.cpp index 56be8aae..00423702 100644 --- a/unit/internal/state_compress.cpp +++ b/unit/internal/state_compress.cpp @@ -98,8 +98,8 @@ TEST(state_compress, m128_1) { char buf[sizeof(m128)] = { 0 }; for (u32 i = 0; i < 16; i++) { - char mask_raw[16] = { 0 }; - char val_raw[16] = { 0 }; + char ALIGN_ATTR(16) mask_raw[16] = { 0 }; + char ALIGN_ATTR(16) val_raw[16] = { 0 }; memset(val_raw, (i << 4) + 3, 16); @@ -109,17 +109,32 @@ TEST(state_compress, m128_1) { mask_raw[15 - i] = 0xff; val_raw[15 - i] = i; - m128 val; - m128 mask; - - memcpy(&val, val_raw, sizeof(val)); - memcpy(&mask, mask_raw, sizeof(mask)); 
+ m128 val = load128(val_raw); + m128 mask = load128(mask_raw); storecompressed128(&buf, &val, &mask, 0); m128 val_out; loadcompressed128(&val_out, &buf, &mask, 0); + int8_t ALIGN_ATTR(16) data[16]; + store128(data, val); + printf("val: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, mask); + printf("mask: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, and128(val, mask)); + printf("and128(val, mask): "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + store128(data, val_out); + printf("val_out: "); + for (int j=0; j < 16; j++) printf("%02x ", data[j]); + printf("\n"); + EXPECT_TRUE(!diff128(and128(val, mask), val_out)); mask_raw[i] = 0x0f; From c4f1372814235f3eead54bdcc639dc6a2028a501 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 5 Nov 2020 20:33:17 +0200 Subject: [PATCH 42/53] remove debug from functions --- src/util/arch/arm/bitutils.h | 84 +----------------------------------- src/util/arch/x86/bitutils.h | 1 - src/util/state_compress.c | 22 ---------- 3 files changed, 1 insertion(+), 106 deletions(-) diff --git a/src/util/arch/arm/bitutils.h b/src/util/arch/arm/bitutils.h index 0b579dc9..1d1e0167 100644 --- a/src/util/arch/arm/bitutils.h +++ b/src/util/arch/arm/bitutils.h @@ -107,102 +107,20 @@ u64a compress64_impl(u64a x, u64a m) { static really_inline m128 compress128_impl(m128 x, m128 m) { -/* x = and128(x, m); // clear irrelevant bits - - // Return zero quickly on trivial cases - if (diff128(x, zeroes128()) == 0) { - return zeroes128(); - }*/ - - - u64a ALIGN_ATTR(16) xv[2]; - u64a ALIGN_ATTR(16) mv[2]; - u64a ALIGN_ATTR(16) res[2]; - u64a ALIGN_ATTR(16) t[2]; - u64a ALIGN_ATTR(16) bbv[2]; - store128(xv, x); - store128(mv, m); - res[0] = 0; - res[1] = 0; - printf("x[%d] = %0llx\n", 0, xv[0]); - printf("x[%d] = %0llx\n", 1, xv[1]); - m128 one = set1_2x64(1); m128 bitset = one; m128 vres = zeroes128(); - for (u64a bb = 1; mv[0] | mv[1]; bb <<= 1) { - printf("bb = %lld\n", bb); - store128(bbv, bitset); - printf("bb[%d] = %0lld\n", 0, bbv[0]); - printf("bb[%d] = %0lld\n", 1, bbv[1]); - printf("m[%d] = %0llx\n", 0, mv[0]); - printf("m[%d] = %0llx\n", 1, mv[1]); - printf("scalar: -m[%d] = %0llx\n", 0, -mv[0]); - printf("scalar: -m[%d] = %0llx\n", 1, -mv[1]); + while (isnonzero128(m)) { m128 mm = sub_2x64(zeroes128(), m); - store128(t, mm); - printf("vector: -m[0] = %0llx\n", t[0]); - printf("vector: -m[1] = %0llx\n", t[1]); m128 tv = and128(x, m); - store128(t, tv); - printf("vector: x[0] & m[0] = %0llx\n", t[0]); - printf("vector: x[1] & m[1] = %0llx\n", t[1]); tv = and128(tv, mm); - store128(t, tv); - printf("vector: x[0] & m[0] & -m[0] = %0llx\n", t[0]); - printf("vector: x[1] & m[1] & -m[1] = %0llx\n", t[1]); - t[0] = xv[0] & mv[0]; - t[1] = xv[1] & mv[1]; - printf("scalar: x[0] & m[0] = %0llx\n", t[0]); - printf("scalar: x[1] & m[1] = %0llx\n", t[1]); - t[0] = xv[0] & mv[0] & -mv[0]; - t[1] = xv[1] & mv[1] & -mv[1]; - printf("scalar: x[0] & m[0] & -m[0] = %0llx\n", t[0]); - printf("scalar: x[1] & m[1] & -m[1] = %0llx\n", t[1]); - - if ( t[0] ) { - printf("x & m & -m != 0\n"); - res[0] |= bb; - printf("x[%d] = %0llx\n", 0, xv[0]); - } - if ( t[1] ) { - printf("x & m & -m != 0\n"); - res[1] |= bb; - printf("x[%d] = %0llx\n", 1, xv[1]); - } m128 mask = not128(eq64_m128(tv, zeroes128())); - store128(t, mask); - printf("mask: x[0] & m[0] & -m[0] != 0 : %0llx\n", t[0]); - printf("mask: x[1] & m[1] & -m[1] != 0 : %0llx\n", t[1]); - mask = vandq_s64(bitset, 
mask); - store128(t, mask); - printf("mask: mask[0] & bitset[1] != 0 : %0llx\n", t[0]); - printf("mask: mask[1] & bitset[1] != 0 : %0llx\n", t[1]); - vres = or128(vres, mask); - store128(t, vres); - printf("res: res[0] != 0 : %0llx\n", t[0]); - printf("res: res[1] != 0 : %0llx\n", t[1]); - if (t[0] != res[0]) { - printf("mismatch: t[0] != res[0]: %0llx != %0llx\n", t[0], res[0]); - } - if (t[1] != res[1]) { - printf("mismatch: t[1] != res[1]: %0llx != %0llx\n", t[1], res[1]); - } - - mv[0] &= mv[0] - 1; - mv[1] &= mv[1] - 1; m = and128(m, sub_2x64(m, set1_2x64(1))); - printf("x[%d] = %0llx\n", 0, xv[0]); - printf("x[%d] = %0llx\n", 1, xv[1]); bitset = lshift64_m128(bitset, 1); } - store128(res, vres); - printf("final x[%d] = %0llx\n", 0, res[0]); - printf("final x[%d] = %0llx\n", 1, res[1]); -// x = load128(res); return vres; } diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index a0769a5e..424ad957 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -239,7 +239,6 @@ u64a expand64_impl(u64a x, u64a m) { #endif } - /* returns the first set bit after begin (if not ~0U). If no bit is set after * begin returns ~0U */ diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 586e47f4..360ec39e 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -109,10 +109,6 @@ static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. -/* u64a x[2]; - memcpy(x, &xvec, sizeof(xvec)); - u64a m[2]; - memcpy(m, &mvec, sizeof(mvec));*/ u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); @@ -121,10 +117,8 @@ void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { // Count the number of bits of compressed state we're writing out per // chunk. u32 ALIGN_ATTR(16) bits[2] = { popcount64(m[0]), popcount64(m[1]) }; - //m128 vbits = load128(bits); // Compress each 64-bit chunk individually. - //u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; xvec = compress128(xvec, mvec); store128(x, xvec); @@ -169,29 +163,13 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { // First, decompose our vectors into 64-bit chunks. 
u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); - printf("m[0] = %0llx\n", m[0]); - printf("m[1] = %0llx\n", m[1]); - -// m[0] = movq(mvec); -// m[1] = movq(rshiftbyte_m128(mvec, 8)); - //store128(m, mvec); -// printf("m[0] = %0llx\n", m[0]); -// printf("m[1] = %0llx\n", m[1]); u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; u64a ALIGN_ATTR(16) v[2]; - printf("bits[0] = %0x\n", bits[0]); - printf("bits[1] = %0x\n", bits[1]); - unpack_bits_64(v, (const u8 *)ptr, bits, 2); - printf("v[0] = %0llx\n", v[0]); - printf("v[1] = %0llx\n", v[1]); u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; - printf("x[0] = %0llx\n", x[0]); - printf("x[1] = %0llx\n", x[1]); - return set2x64(x[1], x[0]); } From 606c53a05f1d6d36d6088cafccd384c94d7fa4d5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:55:03 +0200 Subject: [PATCH 43/53] fix compiler flag testcase --- cmake/arch.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e3cc9f44..cb73ff49 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -78,6 +78,7 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { int32x4_t a = vdupq_n_s32(1); + (void)a; }" HAVE_NEON) else () message (FATAL_ERROR "Unsupported architecture") From 1c26f044a73491baa078b186ddc4cb2c4c8c7222 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:56:40 +0200 Subject: [PATCH 44/53] when building in debug mode, vgetq_lane_*() and vextq_*() need immediate operands, and we have to use switch()'ed versions --- src/util/arch/arm/simd_utils.h | 63 +++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 7c5d11d5..232ca76f 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -161,11 +161,45 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if !defined(DEBUG) return vgetq_lane_u32((uint32x4_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u32((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u32((uint32x4_t) in, 1); + break; + case 2: + return vgetq_lane_u32((uint32x4_t) in, 2); + break; + case 3: + return vgetq_lane_u32((uint32x4_t) in, 3); + break; + default: + return 0; + break; + } +#endif } -static really_inline u32 extract64from128(const m128 in, unsigned imm) { +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if !defined(DEBUG) return vgetq_lane_u64((uint64x2_t) in, imm); +#else + switch (imm) { + case 0: + return vgetq_lane_u64((uint32x4_t) in, 0); + break; + case 1: + return vgetq_lane_u64((uint32x4_t) in, 1); + break; + default: + return 0; + break; + } +#endif } static really_inline m128 and128(m128 a, m128 b) { @@ -278,10 +312,37 @@ char testbit128(m128 val, unsigned int n) { return isnonzero128(and128(mask, val)); } +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + static really_inline m128 palignr(m128 r, m128 l, int offset) { +#if !defined(DEBUG) return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + switch (offset) { + CASE_ALIGN_VECTORS(l, r, 0); + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); 
+ CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + default: + return zeroes128(); + break; + } +#endif } +#undef CASE_ALIGN_VECTORS static really_inline m128 pshufb_m128(m128 a, m128 b) { From d76365240bd56ce981887e991f075839b5549aaf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:57:16 +0200 Subject: [PATCH 45/53] helper functions to print a m128 vector in debug mode --- src/util/arch/common/simd_utils.h | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index c16023ac..39cb91f0 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -44,6 +44,44 @@ #error "You need at least a 128-bit capable SIMD engine!" #endif // HAVE_SIMD_128_BITS +#ifdef DEBUG +static inline void print_m128_16x8(char *label, m128 vector) { + uint8_t __attribute__((aligned(16))) data[16]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 16; i++) + printf("%02x ", data[i]); + printf("\n"); +} + +static inline void print_m128_8x16(char *label, m128 vector) { + uint16_t __attribute__((aligned(16))) data[8]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 8; i++) + printf("%04x ", data[i]); + printf("\n"); +} + +static inline void print_m128_4x32(char *label, m128 vector) { + uint32_t __attribute__((aligned(16))) data[4]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 4; i++) + printf("%08x ", data[i]); + printf("\n"); +} + +static inline void print_m128_2x64(char *label, m128 vector) { + uint64_t __attribute__((aligned(16))) data[2]; + store128(data, vector); + DEBUG_PRINTF("%s: ", label); + for(int i=0; i < 2; i++) + printf("%016lx ", data[i]); + printf("\n"); +} +#endif + /**** **** 256-bit Primitives ****/ From 17ab42d8910d1c419f1c10ef1b3884c0d5a547c5 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 24 Nov 2020 17:59:42 +0200 Subject: [PATCH 46/53] small optimization that was for some reason failing in ARM, should be faster anyway --- src/fdr/teddy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index 97cff0b4..16947c61 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -901,8 +901,10 @@ do { \ #define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ do { \ if (unlikely(diff128(var, ones128()))) { \ - u64a lo = movq(var); \ - u64a hi = movq(rshiftbyte_m128(var, 8)); \ + u64a __attribute__((aligned(16))) vector[2]; \ + store128(vector, var); \ + u64a lo = vector[0]; \ + u64a hi = vector[1]; \ CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ } \ From 259c2572c15a10d5316dc51d8a3cf4e22ebfe793 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:05 +0200 Subject: [PATCH 47/53] define debug vector print functions to NULL in non-debug mode --- src/util/arch/common/simd_utils.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 39cb91f0..0c67ee94 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -80,6 +80,11 @@ static inline void print_m128_2x64(char *label, m128 vector) { 
printf("%016lx ", data[i]); printf("\n"); } +#else +#define print_m128_16x8(label, vector) NULL +#define print_m128_8x16(label, vector) NULL +#define print_m128_4x32(label, vector) NULL +#define print_m128_2x64(label, vector) NULL #endif /**** From 38477b08bc286ad1eec77fabd981d4545257590f Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:38 +0200 Subject: [PATCH 48/53] fix movq and load_m128_from_u64a and resp. test for NEON --- src/util/arch/arm/simd_utils.h | 4 ++-- unit/internal/simd_utils.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 232ca76f..c918eced 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -151,13 +151,13 @@ static really_inline u32 movd(const m128 in) { } static really_inline u64a movq(const m128 in) { - return vgetq_lane_u64((uint64x2_t) in, 1); + return vgetq_lane_u64((uint64x2_t) in, 0); } /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { - return (m128) vdupq_n_u64(*p); + return (m128) vsetq_lane_u64(*p, zeroes128(), 0); } static really_inline u32 extract32from128(const m128 in, unsigned imm) { diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 5c0e0b40..bc1426b1 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) { #if defined(ARCH_IA32) || defined(ARCH_X86_64) simd = _mm_set_epi64x(~0LL, 0x123456789abcdef); #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) - int64x2_t a = { ~0LL, 0x123456789abcdefLL }; + int64x2_t a = { 0x123456789abcdefLL, ~0LL }; simd = vreinterpretq_s64_s8(a); #endif #endif From c38722a68b07436a14f9daa8ba8b50548ff3c9f0 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:27:58 +0200 Subject: [PATCH 49/53] add ARM platform --- src/database.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/database.h b/src/database.h index 5715ed67..7789b9ab 100644 --- a/src/database.h +++ b/src/database.h @@ -51,6 +51,7 @@ extern "C" // CPU type is the low 6 bits (we can't need more than 64, surely!) 
#define HS_PLATFORM_INTEL 1 +#define HS_PLATFORM_ARM 2 #define HS_PLATFORM_CPU_MASK 0x3F #define HS_PLATFORM_NOAVX2 (4<<13) From 39945b7775ebbe4d6bed86c475260db9bd87eb25 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 3 Dec 2020 19:30:50 +0200 Subject: [PATCH 50/53] clear zones array --- src/fdr/fdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index b0f90b52..1a3b7003 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -726,6 +726,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, assert(ISALIGNED_CL(confBase)); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); + memset(zones, 0, sizeof(zones)); size_t numZone = prepareZones(a->buf, a->len, a->buf_history + a->len_history, From 773dc6fa69ff1ab28317a99966a057ad7006c6ad Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 7 Dec 2020 23:12:26 +0200 Subject: [PATCH 51/53] optimize *shiftbyte_m128() functions to use palign instead of variable_byte_shift_m128() --- src/util/arch/arm/simd_utils.h | 78 ++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index c918eced..f7b92e70 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -161,7 +161,7 @@ m128 load_m128_from_u64a(const u64a *p) { } static really_inline u32 extract32from128(const m128 in, unsigned imm) { -#if !defined(DEBUG) +#if defined(HS_OPTIMIZE) return vgetq_lane_u32((uint32x4_t) in, imm); #else switch (imm) { @@ -185,7 +185,7 @@ static really_inline u32 extract32from128(const m128 in, unsigned imm) { } static really_inline u64a extract64from128(const m128 in, unsigned imm) { -#if !defined(DEBUG) +#if defined(HS_OPTIMIZE) return vgetq_lane_u64((uint64x2_t) in, imm); #else switch (imm) { @@ -265,14 +265,52 @@ m128 variable_byte_shift_m128(m128 in, s32 amount) { return vqtbl1q_s8(in, shift_mask); } +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; + +static really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HS_OPTIMIZE) + return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); +#else + switch (offset) { + CASE_ALIGN_VECTORS(l, r, 0); + CASE_ALIGN_VECTORS(l, r, 1); + CASE_ALIGN_VECTORS(l, r, 2); + CASE_ALIGN_VECTORS(l, r, 3); + CASE_ALIGN_VECTORS(l, r, 4); + CASE_ALIGN_VECTORS(l, r, 5); + CASE_ALIGN_VECTORS(l, r, 6); + CASE_ALIGN_VECTORS(l, r, 7); + CASE_ALIGN_VECTORS(l, r, 8); + CASE_ALIGN_VECTORS(l, r, 9); + CASE_ALIGN_VECTORS(l, r, 10); + CASE_ALIGN_VECTORS(l, r, 11); + CASE_ALIGN_VECTORS(l, r, 12); + CASE_ALIGN_VECTORS(l, r, 13); + CASE_ALIGN_VECTORS(l, r, 14); + CASE_ALIGN_VECTORS(l, r, 15); + default: + return zeroes128(); + break; + } +#endif +} +#undef CASE_ALIGN_VECTORS + static really_really_inline m128 rshiftbyte_m128(m128 a, unsigned b) { - return variable_byte_shift_m128(a, -b);; + if (b) + return palignr(zeroes128(), a, b); + else + return a; } static really_really_inline m128 lshiftbyte_m128(m128 a, unsigned b) { - return variable_byte_shift_m128(a, b);; + if (b) + return palignr(a, zeroes128(), 16 - b); + else + return a; } @@ -312,38 +350,6 @@ char testbit128(m128 val, unsigned int n) { return isnonzero128(and128(mask, val)); } -#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return (m128)vextq_s8((int8x16_t)(a), (int8x16_t)(b), (offset)); break; - -static really_inline -m128 palignr(m128 r, m128 l, int offset) { -#if !defined(DEBUG) - 
return (m128)vextq_s8((int8x16_t)l, (int8x16_t)r, offset); -#else - switch (offset) { - CASE_ALIGN_VECTORS(l, r, 0); - CASE_ALIGN_VECTORS(l, r, 1); - CASE_ALIGN_VECTORS(l, r, 2); - CASE_ALIGN_VECTORS(l, r, 3); - CASE_ALIGN_VECTORS(l, r, 4); - CASE_ALIGN_VECTORS(l, r, 5); - CASE_ALIGN_VECTORS(l, r, 6); - CASE_ALIGN_VECTORS(l, r, 7); - CASE_ALIGN_VECTORS(l, r, 8); - CASE_ALIGN_VECTORS(l, r, 9); - CASE_ALIGN_VECTORS(l, r, 10); - CASE_ALIGN_VECTORS(l, r, 11); - CASE_ALIGN_VECTORS(l, r, 12); - CASE_ALIGN_VECTORS(l, r, 13); - CASE_ALIGN_VECTORS(l, r, 14); - CASE_ALIGN_VECTORS(l, r, 15); - default: - return zeroes128(); - break; - } -#endif -} -#undef CASE_ALIGN_VECTORS - static really_inline m128 pshufb_m128(m128 a, m128 b) { /* On Intel, if bit 0x80 is set, then result is zero, otherwise which the lane it is &0xf. From e088c6ae2b87b771552d7c7b2e1ca1db2062beb1 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 7 Dec 2020 23:12:41 +0200 Subject: [PATCH 52/53] remove forgotten printf --- src/util/state_compress.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 360ec39e..5c26f043 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -107,7 +107,6 @@ void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { - printf("storecompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. u64a ALIGN_ATTR(16) x[2]; u64a ALIGN_ATTR(16) m[2]; @@ -159,7 +158,6 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { #if defined(ARCH_64_BIT) static really_inline m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { - printf("loadcompressed128_64bit()\n"); // First, decompose our vectors into 64-bit chunks. u64a ALIGN_ATTR(16) m[2]; store128(m, mvec); From 61b963a7179b4cd5f5774a45918c1b2db7805510 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 8 Dec 2020 11:42:30 +0200 Subject: [PATCH 53/53] fix x86 compilation --- src/util/arch/x86/bitutils.h | 4 ++-- src/util/arch/x86/simd_utils.h | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/util/arch/x86/bitutils.h b/src/util/arch/x86/bitutils.h index 424ad957..33fff7c2 100644 --- a/src/util/arch/x86/bitutils.h +++ b/src/util/arch/x86/bitutils.h @@ -215,8 +215,8 @@ u64a compress64_impl(u64a x, u64a m) { } static really_inline -u64a compress128_impl(m128 x, m128 m) { - compress128_impl_c(x, m); +m128 compress128_impl(m128 x, m128 m) { + return compress128_impl_c(x, m); } static really_inline diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 4a1a691e..9555bf6c 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -33,10 +33,7 @@ #ifndef ARCH_X86_SIMD_UTILS_H #define ARCH_X86_SIMD_UTILS_H -#if !defined(_WIN32) && !defined(__SSSE3__) -#error SSSE3 instructions must be enabled -#endif - +#include "x86.h" #include "ue2common.h" #include "util/simd_types.h" #include "util/unaligned.h"
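
Note on the compress128 added in patches 39 and 42: both the NEON loop and the scalar fallback gather the bits of x selected by m into the low bits of each 64-bit lane, lowest mask bit first (the same effect as x86 PEXT applied per lane). Below is a minimal plain-C sketch of that loop written against two explicit 64-bit lanes; the helper names (compress64_ref, u128_lanes, compress128_ref) are illustrative only, not identifiers from the tree.

#include <stdint.h>
#include <stdio.h>

/* Gather the bits of x selected by m into the low bits of the result,
 * lowest mask bit first (the same effect as x86 PEXT). */
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m != 0; bb <<= 1) {
        if (x & m & -m) {   /* m & -m isolates the lowest set bit of m */
            res |= bb;
        }
        m &= m - 1;         /* clear the lowest set bit of m */
    }
    return res;
}

/* 128-bit variant done lane by lane; the NEON compress128_impl computes the
 * same thing on both lanes at once with sub_2x64/and128/eq64_m128. */
typedef struct { uint64_t lo, hi; } u128_lanes;

static u128_lanes compress128_ref(u128_lanes x, u128_lanes m) {
    u128_lanes r = { compress64_ref(x.lo, m.lo), compress64_ref(x.hi, m.hi) };
    return r;
}

int main(void) {
    u128_lanes x = { 0x00000000deadbeefULL, 0xcafebabe00000000ULL };
    u128_lanes m = { 0x00000000ffff0000ULL, 0xff000000ff000000ULL };
    u128_lanes c = compress128_ref(x, m);
    printf("lo %016llx hi %016llx\n",
           (unsigned long long)c.lo, (unsigned long long)c.hi);
    return 0;
}

The m & -m / m &= m - 1 pair walks the mask one set bit at a time, which is what the vector version mirrors with sub_2x64(zeroes128(), m) and and128(m, sub_2x64(m, set1_2x64(1))).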
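
Note on patches 44 and 51: vgetq_lane_*() and vextq_*() require their lane/offset argument to be a compile-time constant, so in debug (-O0) builds, where the caller's argument is not constant-folded, the code dispatches through a switch so that every intrinsic call site sees a literal immediate. A standalone sketch of that pattern for a byte-wise right shift follows; rshift_bytes and SHIFT_CASE are names invented here, assuming the same vext-based semantics as rshiftbyte_m128.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Every vextq_u8 call site must see a literal immediate, so a run-time
 * shift amount is dispatched through a switch. */
#define SHIFT_CASE(v, n) case (n): return vextq_u8((v), vdupq_n_u8(0), (n))

static uint8x16_t rshift_bytes(uint8x16_t v, unsigned n) {
    switch (n) {
    case 0: return v;
    SHIFT_CASE(v, 1);  SHIFT_CASE(v, 2);  SHIFT_CASE(v, 3);  SHIFT_CASE(v, 4);
    SHIFT_CASE(v, 5);  SHIFT_CASE(v, 6);  SHIFT_CASE(v, 7);  SHIFT_CASE(v, 8);
    SHIFT_CASE(v, 9);  SHIFT_CASE(v, 10); SHIFT_CASE(v, 11); SHIFT_CASE(v, 12);
    SHIFT_CASE(v, 13); SHIFT_CASE(v, 14); SHIFT_CASE(v, 15);
    default: return vdupq_n_u8(0);   /* shifts >= 16 bytes clear the vector */
    }
}

int main(void) {
    uint8_t in[16], out[16];
    for (int i = 0; i < 16; i++) in[i] = (uint8_t)i;
    vst1q_u8(out, rshift_bytes(vld1q_u8(in), 3));
    for (int i = 0; i < 16; i++) printf("%02x ", out[i]);
    printf("\n");
    return 0;
}

With an optimizing build and a constant shift amount, the switch should collapse back to a single ext instruction, so the debug-only dispatch is expected to cost nothing where the immediate form is usable.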
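
Note on the set2x64/movq fixes (patches 38 and 48): on little-endian AArch64, lane 0 of a uint64x2_t holds the low 64 bits of the 128-bit value, so an SSE-style movq maps to vgetq_lane_u64(v, 0) and set2x64(hi, lo) has to store {lo, hi}. A quick standalone check of that lane-ordering assumption (not code from the tree):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* lanes[0] is loaded into lane 0, i.e. the low qword of the vector */
    uint64_t lanes[2] = { 0x0123456789abcdefULL, 0xffffffffffffffffULL };
    uint64x2_t v = vld1q_u64(lanes);
    printf("movq -> %016llx (expect 0123456789abcdef)\n",
           (unsigned long long)vgetq_lane_u64(v, 0));
    return 0;
}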