From d24d67c28b460fb5be4a8bca598dee20558dc55c Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:06:22 +0000
Subject: [PATCH 01/22] Add SIMDe backend to CMake

---
 CMakeLists.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 024acbaa..908b53fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,11 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64)
 elseif (ARCH_PPC64EL)
     include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
     set(ARCH_FLAG mcpu)
+elseif(SIMDE_BACKEND)
+    include (${CMAKE_MODULE_PATH}/simde.cmake)
+    set(ARCH_FLAG march)
+else()
+    message(FATAL_ERROR "Unsupported platform")
 endif ()
 
 # Detect Native arch flags if requested
@@ -253,6 +258,10 @@ elseif (ARCH_PPC64EL)
 set (hs_exec_common_SRCS
     ${hs_exec_common_SRCS}
     src/util/arch/ppc64el/cpuid_flags.c)
+elseif (SIMDE_BACKEND)
+set (hs_exec_common_SRCS
+    ${hs_exec_common_SRCS}
+    src/util/arch/simde/cpuid_flags.c)
 endif ()
 
 set (hs_exec_SRCS
@@ -411,6 +420,11 @@ set (hs_exec_SRCS
     ${hs_exec_SRCS}
     src/nfa/vermicelli_simd.cpp
     src/util/supervector/arch/ppc64el/impl.cpp)
+elseif (SIMDE_BACKEND)
+set (hs_exec_SRCS
+    ${hs_exec_SRCS}
+    src/nfa/vermicelli_simd.cpp
+    src/util/supervector/arch/simde/impl.cpp)
 endif()
 
 if (ARCH_IA32 OR ARCH_X86_64)

From 129015afc651ba8f01d12d464d40e75e8985144f Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:09:24 +0000
Subject: [PATCH 02/22] add SIMDe git submodule

---
 .gitmodules | 6 ++++++
 simde       | 1 +
 2 files changed, 7 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 simde

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..f82d1abf
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "build-simde/simde"]
+	path = build-simde/simde
+	url = https://github.com/simd-everywhere/simde.git
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde.git
diff --git a/simde b/simde
new file mode 160000
index 00000000..aae22459
--- /dev/null
+++ b/simde
@@ -0,0 +1 @@
+Subproject commit aae22459fa284e9fc2b7d4b8e4571afa0418125f

From 8455cba03dd09654ffd52c7e1dde218946ffe960 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:09:48 +0000
Subject: [PATCH 03/22] add SIMDe cmake file

---
 cmake/simde.cmake | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 cmake/simde.cmake

diff --git a/cmake/simde.cmake b/cmake/simde.cmake
new file mode 100644
index 00000000..b68c8e57
--- /dev/null
+++ b/cmake/simde.cmake
@@ -0,0 +1,5 @@
+include_directories(${PROJECT_SOURCE_DIR}/simde/simde)
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_BACKEND")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_BACKEND")
+

From b5cde5ebf7543c4fada5406cc2677b4783b95a5e Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:11:09 +0000
Subject: [PATCH 04/22] modified .gitmodules

---
 .gitmodules | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index f82d1abf..8dd6c091 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "build-simde/simde"]
-	path = build-simde/simde
-	url = https://github.com/simd-everywhere/simde.git
 [submodule "simde"]
 	path = simde
 	url = https://github.com/simd-everywhere/simde.git

From b068087240c08e097e97fb5ed71f08169fcde8e9 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:12:04 +0000
Subject: [PATCH 05/22] add SIMDe ports of simd_utils and supervector

---
 src/util/arch/simde/cpuid_flags.c        |  41 ++
src/util/arch/simde/simd_utils.h | 377 ++++++++++++++++ src/util/supervector/arch/simde/impl.cpp | 530 +++++++++++++++++++++++ src/util/supervector/supervector.hpp | 2 + 4 files changed, 950 insertions(+) create mode 100644 src/util/arch/simde/cpuid_flags.c create mode 100644 src/util/arch/simde/simd_utils.h create mode 100644 src/util/supervector/arch/simde/impl.cpp diff --git a/src/util/arch/simde/cpuid_flags.c b/src/util/arch/simde/cpuid_flags.c new file mode 100644 index 00000000..a2f3758c --- /dev/null +++ b/src/util/arch/simde/cpuid_flags.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "util/arch/common/cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return 0; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/src/util/arch/simde/simd_utils.h b/src/util/arch/simde/simd_utils.h new file mode 100644 index 00000000..d241f87c --- /dev/null +++ b/src/util/arch/simde/simd_utils.h @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief SIMD types and primitive operations.
+ */
+
+#ifndef ARCH_SIMDE_SIMD_UTILS_H
+#define ARCH_SIMDE_SIMD_UTILS_H
+
+#include "ue2common.h"
+#include "util/simd_types.h"
+#include "util/unaligned.h"
+#include "util/intrinsics.h"
+
+#include <string.h> // for memcpy
+
+#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
+#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
+
+/** \brief LUT for the mask1bit functions. */
+ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = {
+    ZEROES_32, ZEROES_32,
+    ZEROES_31, 0x01, ZEROES_32,
+    ZEROES_31, 0x02, ZEROES_32,
+    ZEROES_31, 0x04, ZEROES_32,
+    ZEROES_31, 0x08, ZEROES_32,
+    ZEROES_31, 0x10, ZEROES_32,
+    ZEROES_31, 0x20, ZEROES_32,
+    ZEROES_31, 0x40, ZEROES_32,
+    ZEROES_31, 0x80, ZEROES_32,
+    ZEROES_32, ZEROES_32,
+};
+
+static really_inline m128 ones128(void) {
+    return (m128) _mm_set1_epi8(0xFF);
+}
+
+static really_inline m128 zeroes128(void) {
+    return (m128) _mm_setzero_si128();
+}
+
+/** \brief Bitwise not for m128*/
+static really_inline m128 not128(m128 a) {
+    return (m128) _mm_xor_si128(a, ones128());
+}
+
+/** \brief Return 1 if a and b are different otherwise 0 */
+static really_inline int diff128(m128 a, m128 b) {
+    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
+}
+
+static really_inline int isnonzero128(m128 a) {
+    return !!diff128(a, zeroes128());
+}
+
+/**
+ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
+ * mask indicating which 32-bit words contain differences.
+ */
+static really_inline u32 diffrich128(m128 a, m128 b) {
+    a = _mm_cmpeq_epi32(a, b);
+    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
+}
+
+/**
+ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
+ * returns a 4-bit mask indicating which 64-bit words contain differences.
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { + return _mm_slli_epi64(a, b); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +static really_inline m128 set1_16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return _mm_set1_epi64x(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +static really_inline u64a movq(const m128 in) { + return _mm_cvtsi128_si64(in); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; + +static really_inline +m128 rshiftbyte_m128(const m128 a, int count_immed) { + switch (count_immed) { + case 0: return a; break; + CASE_RSHIFT_VECTOR(a, 1); + CASE_RSHIFT_VECTOR(a, 2); + CASE_RSHIFT_VECTOR(a, 3); + CASE_RSHIFT_VECTOR(a, 4); + CASE_RSHIFT_VECTOR(a, 5); + CASE_RSHIFT_VECTOR(a, 6); + CASE_RSHIFT_VECTOR(a, 7); + CASE_RSHIFT_VECTOR(a, 8); + CASE_RSHIFT_VECTOR(a, 9); + CASE_RSHIFT_VECTOR(a, 10); + CASE_RSHIFT_VECTOR(a, 11); + CASE_RSHIFT_VECTOR(a, 12); + CASE_RSHIFT_VECTOR(a, 13); + CASE_RSHIFT_VECTOR(a, 14); + CASE_RSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_RSHIFT_VECTOR + +#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; + +static really_inline +m128 lshiftbyte_m128(const m128 a, int count_immed) { + switch (count_immed) { + case 0: return a; break; + CASE_LSHIFT_VECTOR(a, 1); + CASE_LSHIFT_VECTOR(a, 2); + CASE_LSHIFT_VECTOR(a, 3); + CASE_LSHIFT_VECTOR(a, 4); + CASE_LSHIFT_VECTOR(a, 5); + CASE_LSHIFT_VECTOR(a, 6); + CASE_LSHIFT_VECTOR(a, 7); + CASE_LSHIFT_VECTOR(a, 8); + CASE_LSHIFT_VECTOR(a, 9); + CASE_LSHIFT_VECTOR(a, 10); + CASE_LSHIFT_VECTOR(a, 11); + CASE_LSHIFT_VECTOR(a, 12); + CASE_LSHIFT_VECTOR(a, 13); + CASE_LSHIFT_VECTOR(a, 14); + CASE_LSHIFT_VECTOR(a, 15); + default: return zeroes128(); break; + } +} +#undef CASE_LSHIFT_VECTOR + +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) + +static really_inline m128 add128(m128 a, m128 b) { + return _mm_add_epi64(a, b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = vectorscan_assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = vectorscan_assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void 
storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr_imm(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + return _mm_shuffle_epi8(a, b); +} + +#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return palignr_imm((m128)(a), (m128)(b), (offset)); break; + +static really_really_inline +m128 palignr_sw(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + CASE_ALIGN_VECTORS(r, l, 1); + CASE_ALIGN_VECTORS(r, l, 2); + CASE_ALIGN_VECTORS(r, l, 3); + CASE_ALIGN_VECTORS(r, l, 4); + CASE_ALIGN_VECTORS(r, l, 5); + CASE_ALIGN_VECTORS(r, l, 6); + CASE_ALIGN_VECTORS(r, l, 7); + CASE_ALIGN_VECTORS(r, l, 8); + CASE_ALIGN_VECTORS(r, l, 9); + CASE_ALIGN_VECTORS(r, l, 10); + CASE_ALIGN_VECTORS(r, l, 11); + CASE_ALIGN_VECTORS(r, l, 12); + CASE_ALIGN_VECTORS(r, l, 13); + CASE_ALIGN_VECTORS(r, l, 14); + CASE_ALIGN_VECTORS(r, l, 15); + case 16: return r; break; + default: + return zeroes128(); + break; + } +} +#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(offset)) { + return palignr_imm(r, l, offset); + } +#endif + return palignr_sw(r, l, offset); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + if (amount < 0) { + return palignr(zeroes128(), in, -amount); + } else { + return palignr(in, zeroes128(), 16 - amount); + } +} +/* +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +}*/ + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + return _mm_set_epi32(x3, x2, x1, x0); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + 
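+/*
+ * A minimal usage sketch of the one-bit helpers above (illustrative only;
+ * the function name is hypothetical and not part of the patch).
+ * mask1bit128(n) indexes simd_onebit_masks so that the returned vector has
+ * exactly one bit set: bit (n % 8) of byte (n / 8).
+ */
+static really_inline void example_onebit_usage(void) {
+    m128 v = zeroes128();
+    setbit128(&v, 42);          // byte 5 becomes 0x04 (bit 2 of byte 5)
+    assert(testbit128(v, 42));
+    clearbit128(&v, 42);
+    assert(!isnonzero128(v));
+}
+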
+#endif // ARCH_SIMDE_SIMD_UTILS_H diff --git a/src/util/supervector/arch/simde/impl.cpp b/src/util/supervector/arch/simde/impl.cpp new file mode 100644 index 00000000..b1c9b631 --- /dev/null +++ b/src/util/supervector/arch/simde/impl.cpp @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef SIMD_IMPL_HPP
+#define SIMD_IMPL_HPP
+
+#include <cstdint>
+#include <cstdio>
+
+#include "ue2common.h"
+#include "util/arch.h"
+#include "util/unaligned.h"
+#include "util/supervector/supervector.hpp"
+
+template<>
+really_inline SuperVector<16>::SuperVector(SuperVector const &other)
+{
+    u.v128[0] = other.u.v128[0];
+}
+
+template<>
+really_inline SuperVector<16>::SuperVector(typename base_type::type const v)
+{
+    u.v128[0] = v;
+};
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int8_t const other)
+{
+    u.v128[0] = _mm_set1_epi8(other);
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
+{
+    u.v128[0] = _mm_set1_epi8(static_cast<int8_t>(other));
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int16_t const other)
+{
+    u.v128[0] = _mm_set1_epi16(other);
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
+{
+    u.v128[0] = _mm_set1_epi16(static_cast<int16_t>(other));
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int32_t const other)
+{
+    u.v128[0] = _mm_set1_epi32(other);
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
+{
+    u.v128[0] = _mm_set1_epi32(static_cast<int32_t>(other));
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(int64_t const other)
+{
+    u.v128[0] = _mm_set1_epi64x(other);
+}
+
+template<>
+template<>
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
+{
+    u.v128[0] = _mm_set1_epi64x(static_cast<int64_t>(other));
+}
+
+// Constants
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones()
+{
+    return {_mm_set1_epi8(0xFF)};
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Zeroes(void)
+{
+    return {_mm_set1_epi8(0)};
+}
+
+// Methods
+
+template <>
+really_inline void SuperVector<16>::operator=(SuperVector<16> const &other)
+{
+    u.v128[0] = other.u.v128[0];
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const
+{
+    return {_mm_and_si128(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const
+{
+    return {_mm_or_si128(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const
+{
+    return {_mm_xor_si128(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!() const
+{
+    // bitwise NOT: XOR with all-ones (XOR with itself would always yield zero)
+    return {_mm_xor_si128(u.v128[0], _mm_set1_epi8(0xFF))};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const
+{
+    return {_mm_andnot_si128(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
+{
+    return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{
+    return !(*this == b);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{
+    return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{
+    return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{
+    return !(*this < b);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{
+    return !(*this > b);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{
+    return (*this == b);
+}
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+    return (u32)_mm_movemask_epi8(u.v128[0]);
+}
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+    return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 1; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+    typename SuperVector<16>::comparemask_type mask) {
+    return mask;
+}
+
+// template <>
+// template <uint8_t N>
+// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
+// {
+//     const uint8_t i = N;
+//     return {_mm_slli_epi8(u.v128[0], i)};
+// }
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{
+    return {_mm_slli_epi16(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{
+    return {_mm_slli_epi32(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{
+    return {_mm_slli_epi64(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{
+    return {_mm_slli_si128(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{
+    return vshl_128_imm<N>();
+}
+
+// template <>
+// template <uint8_t N>
+// really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
+// {
+//     return {_mm_srli_epi8(u.v128[0], N)};
+// }
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{
+    return {_mm_srli_epi16(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{
+    return {_mm_srli_epi32(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{
+    return {_mm_srli_epi64(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{
+    return {_mm_srli_si128(u.v128[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{
+    return vshr_128_imm<N>();
+}
+
+#if !defined(HS_OPTIMIZE)
+template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
+#endif
+
+// template <>
+// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
+// {
+//     Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; });
+//     if (N == 16) return Zeroes();
+// }
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N)
const +{ +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(N)) { + return {_mm_slli_epi16(u.v128[0], N)}; + } +#endif + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const +{ + return vshr_128(N); +} + +template <> +really_inline SuperVector<16> 
SuperVector<16>::operator<<(uint8_t const N) const
+{
+    return vshl_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{
+    if (N == 0) return Ones();
+    else return Ones().vshr_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{
+    if (N == 0) return Ones();
+    else return Ones().vshl_128(N); // fixed: was vshr_128, which shifts the wrong way
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
+{
+    return _mm_loadu_si128((const m128 *)ptr);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
+{
+    assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
+    ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
+    return _mm_load_si128((const m128 *)ptr);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 -len);
+    SuperVector v = _mm_loadu_si128((const m128 *)ptr);
+    return mask & v;
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
+{
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    if (__builtin_constant_p(offset)) {
+        if (offset == 16) {
+            return *this;
+        } else {
+            return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)};
+        }
+    }
+#endif
+    switch(offset) {
+    case 0: return other; break;
+    case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break;
+    case 2: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)}; break;
+    case 3: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)}; break;
+    case 4: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)}; break;
+    case 5: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)}; break;
+    case 6: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)}; break;
+    case 7: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)}; break;
+    case 8: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)}; break;
+    case 9: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)}; break;
+    case 10: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)}; break;
+    case 11: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)}; break;
+    case 12: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)}; break;
+    case 13: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)}; break;
+    case 14: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)}; break;
+    case 15: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)}; break;
+    default: break;
+    }
+    return *this;
+}
+
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b)
+{
+    return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])};
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 -len);
+    return mask & pshufb(b);
+}
+
+#endif // SIMD_IMPL_HPP
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index c0200575..730a6fd2 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -388,6 +388,8 @@ struct Unroller
 #include "util/supervector/arch/arm/impl.cpp"
 #elif defined(ARCH_PPC64EL)
 #include "util/supervector/arch/ppc64el/impl.cpp"
+#elif defined(SIMDE_BACKEND)
+#include "util/supervector/arch/simde/impl.cpp"
 #endif
 
 #endif

From a8e9b9069e006df2899dd5369b12f2b96b9833be Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:13:33 +0000
Subject: [PATCH 06/22] enable SIMDe backend

---
 src/nfa/shufti_simd.hpp           |  2 +-
 src/nfa/truffle_simd.hpp          |  2 +-
 src/nfa/vermicelli_simd.cpp       |  2 +-
 src/util/arch/common/simd_utils.h |  4 ++--
 src/util/bitutils.h               | 25 +++++++++++++++++++++++++
 src/util/intrinsics.h             |  2 --
 src/util/match.hpp                |  2 +-
 src/util/simd_types.h             |  9 +++++++--
 src/util/simd_utils.h             |  2 ++
 9 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp
index 0f8e2a7b..30df80bf 100644
--- a/src/nfa/shufti_simd.hpp
+++ b/src/nfa/shufti_simd.hpp
@@ -52,7 +52,7 @@ template <uint16_t S>
 static really_inline
 SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars);
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
 #include "x86/shufti.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "arm/shufti.hpp"
diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp
index e07e92f6..0214833c 100644
--- a/src/nfa/truffle_simd.hpp
+++ b/src/nfa/truffle_simd.hpp
@@ -45,7 +45,7 @@ template <uint16_t S>
 static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars);
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
 #include "x86/truffle.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "arm/truffle.hpp"
diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp
index a0da0719..c5fbc39a 100644
--- a/src/nfa/vermicelli_simd.cpp
+++ b/src/nfa/vermicelli_simd.cpp
@@ -71,7 +71,7 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data,
                                       SuperVector<S> const mask1, SuperVector<S> const mask2,
                                       u8 const c1, u8 const c2, u8 const m1, u8 const m2,
                                       u8 const *buf, u16 const len);
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
 #include "x86/vermicelli.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "arm/vermicelli.hpp"
diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h
index d142ee9a..2542f0f6 100644
--- a/src/util/arch/common/simd_utils.h
+++ b/src/util/arch/common/simd_utils.h
@@ -41,7 +41,7 @@
 
 #include <string.h> // for memcpy
 
-#if !defined(HAVE_SIMD_128_BITS)
+#if !defined(HAVE_SIMD_128_BITS) && !defined(SIMDE_BACKEND)
 #error "You need at least a 128-bit capable SIMD engine!"
 #endif // HAVE_SIMD_128_BITS
 
@@ -88,7 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
 #define print_m128_2x64(label, vec) ;
 #endif
 
-#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
+#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(SIMDE_BACKEND)
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index ffc8f45d..7e006158 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -51,6 +51,31 @@
 #include "util/arch/arm/bitutils.h"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/bitutils.h"
+#else
+#include "util/arch/common/bitutils.h"
+#define clz32_impl clz32_impl_c
+#define clz64_impl clz64_impl_c
+#define ctz32_impl ctz32_impl_c
+#define ctz64_impl ctz64_impl_c
+#define lg2_impl lg2_impl_c
+#define lg2_64_impl lg2_64_impl_c
+#define findAndClearLSB_32_impl findAndClearLSB_32_impl_c
+#define findAndClearLSB_64_impl findAndClearLSB_64_impl_c
+#define findAndClearMSB_32_impl findAndClearMSB_32_impl_c
+#define findAndClearMSB_64_impl findAndClearMSB_64_impl_c
+#define compress32_impl compress32_impl_c
+#define compress64_impl compress64_impl_c
+#define compress128_impl compress128_impl_c
+#define expand32_impl expand32_impl_c
+#define expand64_impl expand64_impl_c
+#define expand128_impl expand128_impl_c
+#define bf64_iterate_impl bf64_iterate_impl_c
+#define bf64_set_impl bf64_set_impl_c
+#define bf64_unset_impl bf64_unset_impl_c
+#define rank_in_mask32_impl rank_in_mask32_impl_c
+#define rank_in_mask64_impl rank_in_mask64_impl_c
+#define pext32_impl pext32_impl_c
+#define pext64_impl pext64_impl_c
 #endif
 
 static really_inline
diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
index 08eb6ba6..64f9e9ba 100644
--- a/src/util/intrinsics.h
+++ b/src/util/intrinsics.h
@@ -74,8 +74,6 @@
 # endif
 #elif defined(USE_PPC64EL_ALTIVEC_H)
 #include <altivec.h>
-#else
-#error no intrinsics file
 #endif
 
 #endif // INTRINSICS_H
diff --git a/src/util/match.hpp b/src/util/match.hpp
index 003c665f..68497349 100644
--- a/src/util/match.hpp
+++ b/src/util/match.hpp
@@ -49,7 +49,7 @@ const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const len = S);
 template <uint16_t S>
 const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
 #include "util/arch/x86/match.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/match.hpp"
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 4f0fd1a9..b9e2a492 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -42,8 +42,13 @@
 #include "util/arch/ppc64el/simd_types.h"
 #endif
 
-#if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
-typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
+#if defined(SIMDE_BACKEND)
+#define VECTORSIZE 16
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#define SIMDE_NO_NATIVE
+#include "simde/simde/x86/sse4.2.h"
+typedef simde__m128i m128;
+#define HAVE_SIMD_128_BITS
 #endif
 
 #if !defined(m256) && !defined(HAVE_SIMD_256_BITS)
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 2f0012c6..0ed66177 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -67,6 +67,8 @@ extern const char vbs_mask_data[];
 #include "util/arch/arm/simd_utils.h"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/simd_utils.h"
+#elif defined(SIMDE_BACKEND)
+#include "util/arch/simde/simd_utils.h"
 #endif
 
 #include "util/arch/common/simd_utils.h"

From 14c9222a48eafca353ef925ba802033e0726561b Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:13:54 +0000
Subject: [PATCH 07/22] add generic tune flags

---
 cmake/archdetect.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake
index 015140fe..387437eb 100644
--- a/cmake/archdetect.cmake
+++ b/cmake/archdetect.cmake
@@ -84,8 +84,11 @@ else()
     elseif(ARCH_ARM32)
         set(GNUCC_ARCH armv7a)
         set(TUNE_FLAG generic)
-    else()
+    elseif(ARCH_PPC64EL)
        set(GNUCC_ARCH power9)
        set(TUNE_FLAG power9)
+    else()
+       set(GNUCC_ARCH native)
+       set(TUNE_FLAG native)
     endif()
 endif()

From 7c53b4e608bd6166d003007b3aa7e0dccff434fc Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 21 Nov 2023 17:14:21 +0000
Subject: [PATCH 08/22] add include dirs

---
 benchmarks/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 90c685c4..63391a68 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,4 +1,7 @@
-if (NOT FAT_RUNTIME AND (BUILD_STATIC_AND_SHARED OR BUILD_STATIC_LIBS))
+include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
+
+if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
     add_executable(benchmarks benchmarks.cpp)
     set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
         "-Wall -Wno-unused-variable")

From b32ca719d9da787db3cd3d278bc8bb1099ffd819 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 23 Nov 2023 13:07:28 +0000
Subject: [PATCH 09/22] SIMDE is a valid platform

---
 src/hs_valid_platform.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c
index 0af36b6c..067a05e6 100644
--- a/src/hs_valid_platform.c
+++ b/src/hs_valid_platform.c
@@ -50,7 +50,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) {
     } else {
         return HS_ARCH_ERROR;
     }
-#elif defined(ARCH_PPC64EL)
-    return HS_SUCCESS;
+#elif defined(ARCH_PPC64EL) || defined(SIMDE_BACKEND)
+    return HS_SUCCESS;
 #endif
 }

From 62cb8d6c2d3f1927dcb5aaf3a6be71ebed00c50b Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 23 Nov 2023 16:07:58 +0000
Subject: [PATCH 10/22] fix test for SIMDe

---
 unit/internal/simd_utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index c57cd598..a9737bd2 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -667,7 +667,7 @@ TEST(SimdUtilsTest, movq) {
     ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));
 
 #if defined(HAVE_SIMD_128_BITS)
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
     simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     int64x2_t a = { 0x123456789abcdefLL, ~0LL };

From 20f4f542a5b1d188cc4614d98c3a3b3234954ef3 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 23 Nov 2023 16:08:26 +0000
Subject: [PATCH 11/22] add missing intrinsics for SIMDe backend

---
 src/util/arch/simde/simd_utils.h | 15 +++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/util/arch/simde/simd_utils.h b/src/util/arch/simde/simd_utils.h
index d241f87c..b8e7d4a8 100644
--- a/src/util/arch/simde/simd_utils.h
+++ b/src/util/arch/simde/simd_utils.h
@@ -99,14 +99,25 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
     return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
 }
 
+static really_really_inline
+m128 add_2x64(m128 a, m128 b) {
+    return (m128) _mm_add_epi64(a, b);
+}
+
+static really_really_inline
+m128 sub_2x64(m128 a, m128 b) {
+    return (m128) _mm_sub_epi64(a, b);
+}
+
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
     return _mm_slli_epi64(a, b);
 }
 
 #define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
-#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
-#define movemask128(a)   ((u32)_mm_movemask_epi8((a)))
+#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
+#define eq64_m128(a, b)  _mm_cmpeq_epi64((a), (b))
+#define movemask128(a)   ((u32)_mm_movemask_epi8((a)))
 
 static really_inline m128 set1_16x8(u8 c) {
     return _mm_set1_epi8(c);

From dfacf758559208b4ed93551d0d0d1659bad3bd5b Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Thu, 23 Nov 2023 16:09:10 +0000
Subject: [PATCH 12/22] existing scalar implementations were incorrect but
 never tested; ported from arm/ppc64le

---
 src/util/arch/common/bitutils.h | 48 ++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
index e5ff5bc1..e5ab0d05 100644
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -214,16 +214,22 @@ u64a compress64_impl_c(u64a x, u64a m) {
 }
 
 static really_inline
-m128 compress128_impl_c(m128 xvec, m128 mvec) {
-    u64a ALIGN_ATTR(16) x[2];
-    u64a ALIGN_ATTR(16) m[2];
-    store128(x, xvec);
-    store128(m, mvec);
+m128 compress128_impl_c(m128 x, m128 m) {
+    m128 one = set1_2x64(1);
+    m128 bitset = one;
+    m128 vres = zeroes128();
+    while (isnonzero128(m)) {
+        m128 mm = sub_2x64(zeroes128(), m);
+        m128 tv = and128(x, m);
+        tv = and128(tv, mm);
 
-    compress64_impl_c(x[0], m[0]);
-    compress64_impl_c(x[1], m[1]);
-
-    return xvec;
+        m128 mask = not128(eq64_m128(tv, zeroes128()));
+        mask = and128(bitset, mask);
+        vres = or128(vres, mask);
+        m = and128(m, sub_2x64(m, one));
+        bitset = lshift64_m128(bitset, 1);
+    }
+    return vres;
 }
 
 static really_inline
@@ -303,16 +309,20 @@ u64a expand64_impl_c(u64a x, u64a m) {
 }
 
 static really_inline
-m128 expand128_impl_c(m128 xvec, m128 mvec) {
-    u64a ALIGN_ATTR(16) x[2];
-    u64a ALIGN_ATTR(16) m[2];
-    store128(x, xvec);
-    store128(m, mvec);
-
-    expand64_impl_c(x[0], m[0]);
-    expand64_impl_c(x[1], m[1]);
-
-    return xvec;
+m128 expand128_impl_c(m128 x, m128 m) {
+    m128 one = set1_2x64(1);
+    m128 bb = one;
+    m128 res = zeroes128();
+    while (isnonzero128(m)) {
+        m128 xm = and128(x, bb);
+        m128 mm = sub_2x64(zeroes128(), m);
+        m128 mask = not128(eq64_m128(xm, zeroes128()));
+        mask = and128(mask, and128(m,mm));
+        res = or128(res, mask);
+        m = and128(m, sub_2x64(m, one));
+        bb = lshift64_m128(bb, 1);
+    }
+    return res;
 }
 
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin, returns ~0U. */
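
For reference, each 64-bit lane of the corrected compress128_impl_c above computes the same PEXT-style bit gather as the scalar compress64_impl_c: the bits of x selected by m are packed into the low bits of the result. A minimal scalar sketch of that semantics, assuming plain C (compress64_ref is a hypothetical name, for illustration only):

    static inline uint64_t compress64_ref(uint64_t x, uint64_t m) {
        uint64_t res = 0;
        /* walk the set bits of m from lowest to highest */
        for (uint64_t bb = 1; m != 0; m &= m - 1, bb <<= 1) {
            if (x & m & -m) {   /* lowest set bit of m also set in x? */
                res |= bb;
            }
        }
        return res;
    }

expand128_impl_c is the inverse (PDEP-style scatter): bit i of x is moved to the position of the i-th set bit of m.
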
If no bit is set after From f57928ea08c23c960313e35fc89d77a583031102 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 27 Nov 2023 12:21:58 +0000 Subject: [PATCH 13/22] fix SIMDe emulation builds on Arm, add native translation from x86 for comparison --- CMakeLists.txt | 32 +- cmake/archdetect.cmake | 9 +- cmake/simde.cmake | 10 +- src/hs_valid_platform.c | 9 +- src/nfa/shufti_simd.hpp | 10 +- src/nfa/truffle_simd.hpp | 8 +- src/nfa/vermicelli_simd.cpp | 8 +- src/util/arch/common/simd_utils.h | 6 +- src/util/arch/simde/simd_utils.h | 388 ----------------- src/util/arch/x86/simd_utils.h | 33 +- src/util/bitutils.h | 4 +- src/util/match.hpp | 8 +- src/util/simd_types.h | 20 +- src/util/simd_utils.h | 7 +- src/util/supervector/arch/simde/impl.cpp | 530 ----------------------- src/util/supervector/supervector.hpp | 14 +- unit/internal/simd_utils.cpp | 3 +- 17 files changed, 106 insertions(+), 993 deletions(-) delete mode 100644 src/util/arch/simde/simd_utils.h delete mode 100644 src/util/supervector/arch/simde/impl.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 908b53fc..7ca7b994 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,7 +119,10 @@ endif() # Detect OS and if Fat Runtime is available include (${CMAKE_MODULE_PATH}/osdetection.cmake) -if (ARCH_IA32 OR ARCH_X86_64) +if(SIMDE_BACKEND) + include (${CMAKE_MODULE_PATH}/simde.cmake) + set(ARCH_FLAG march) +elseif (ARCH_IA32 OR ARCH_X86_64) include (${CMAKE_MODULE_PATH}/cflags-x86.cmake) set(ARCH_FLAG march) elseif (ARCH_ARM32 OR ARCH_AARCH64) @@ -128,10 +131,6 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) elseif (ARCH_PPC64EL) include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake) set(ARCH_FLAG mcpu) -elseif(SIMDE_BACKEND) - include (${CMAKE_MODULE_PATH}/simde.cmake) - set(ARCH_FLAG march) -else() message(FATAL_ERROR "Unsupported platform") endif () @@ -243,8 +242,11 @@ set (hs_exec_common_SRCS src/util/arch/common/cpuid_flags.h src/util/multibit.c ) - -if (ARCH_IA32 OR ARCH_X86_64) +if (SIMDE_BACKEND) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/simde/cpuid_flags.c) +elseif (ARCH_IA32 OR ARCH_X86_64) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/x86/cpuid_flags.c @@ -258,10 +260,6 @@ elseif (ARCH_PPC64EL) set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/ppc64el/cpuid_flags.c) -elseif (SIMDE_BACKEND) -set (hs_exec_common_SRCS - ${hs_exec_common_SRCS} - src/util/arch/simde/cpuid_flags.c) endif () set (hs_exec_SRCS @@ -406,7 +404,12 @@ set (hs_exec_SRCS src/database.h ) -if (ARCH_IA32 OR ARCH_X86_64) +if (SIMDE_BACKEND) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/nfa/vermicelli_simd.cpp + src/util/supervector/arch/x86/impl.cpp) +elseif (ARCH_IA32 OR ARCH_X86_64) set (hs_exec_SRCS ${hs_exec_SRCS} src/nfa/vermicelli_simd.cpp @@ -420,11 +423,6 @@ set (hs_exec_SRCS ${hs_exec_SRCS} src/nfa/vermicelli_simd.cpp src/util/supervector/arch/ppc64el/impl.cpp) -elseif (SIMDE_BACKEND) -set (hs_exec_SRCS - ${hs_exec_SRCS} - src/nfa/vermicelli_simd.cpp - src/util/supervector/arch/simde/impl.cpp) endif() if (ARCH_IA32 OR ARCH_X86_64) diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake index 387437eb..87c4c4e7 100644 --- a/cmake/archdetect.cmake +++ b/cmake/archdetect.cmake @@ -67,7 +67,10 @@ if (USE_CPU_NATIVE) message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}") endif() else() - if (ARCH_IA32 OR ARCH_X86_64) + if (SIMDE_BACKEND) + set(GNUCC_ARCH native) + set(TUNE_FLAG generic) + elseif (ARCH_IA32 OR ARCH_X86_64) set(GNUCC_ARCH native) set(TUNE_FLAG generic) 
elseif(ARCH_AARCH64) @@ -85,8 +88,8 @@ else() set(GNUCC_ARCH armv7a) set(TUNE_FLAG generic) elseif(ARCH_PPC64EL) - set(GNUCC_ARCH power9) - set(TUNE_FLAG power9) + set(GNUCC_ARCH power8) + set(TUNE_FLAG power8) else() set(GNUCC_ARCH native) set(TUNE_FLAG native) diff --git a/cmake/simde.cmake b/cmake/simde.cmake index b68c8e57..12c56c6d 100644 --- a/cmake/simde.cmake +++ b/cmake/simde.cmake @@ -1,5 +1,9 @@ -include_directories(${PROJECT_SOURCE_DIR}/simde/simde) +# include_directories(${PROJECT_SOURCE_DIR}/simde/simde) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_BACKEND") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_BACKEND") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND") +if (SIMDE_NATIVE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") +endif() diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 067a05e6..74a8fc1e 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2020-2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,28 +30,30 @@ #include "config.h" #include "hs_common.h" #include "ue2common.h" +#if !defined(VS_SIMDE_BACKEND) #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/cpuid_inline.h" #elif defined(ARCH_AARCH64) #include "util/arch/arm/cpuid_inline.h" #endif +#endif HS_PUBLIC_API hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ -#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64)) if (check_ssse3()) { return HS_SUCCESS; } else { return HS_ARCH_ERROR; } -#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) if (check_neon()) { return HS_SUCCESS; } else { return HS_ARCH_ERROR; } -#elif defined(ARCH_PPC64EL) || defined(SIMDE_BACKEND) +#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND) return HS_SUCCESS; #endif } diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index 30df80bf..feeb54ab 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2020-2023, VectorCamp PC * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without @@ -52,13 +52,17 @@ template static really_inline SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars); -#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND) +#if defined(VS_SIMDE_BACKEND) #include "x86/shufti.hpp" -#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) +#include "x86/shufti.hpp" +#elif (defined(ARCH_ARM32) || defined(ARCH_AARCH64)) #include "arm/shufti.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/shufti.hpp" #endif +#endif template static really_inline diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index 0214833c..c1028156 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -1,6 
+1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2020-2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,13 +45,17 @@ template static really_inline const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars); -#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND) +#if defined(VS_SIMDE_BACKEND) +#include "x86/truffle.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "x86/truffle.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "arm/truffle.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/truffle.hpp" #endif +#endif template static really_inline diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index c5fbc39a..67ac1dac 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2020-2023, VectorCamp PC * Copyright (c) 2021, Arm Limited * * Redistribution and use in source and binary forms, with or without @@ -71,13 +71,17 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector SuperVector const mask1, SuperVector const mask2, u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len); -#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND) +#if defined(VS_SIMDE_BACKEND) +#include "x86/vermicelli.hpp" +#else +#if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "x86/vermicelli.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "arm/vermicelli.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/vermicelli.hpp" #endif +#endif template static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector const casemask, u8 const *buf, u8 const *buf_end) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 2542f0f6..89190648 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2020-2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,7 +41,7 @@ #include // for memcpy -#if !defined(HAVE_SIMD_128_BITS) && !defined(SIMDE_BACKEND) +#if !defined(HAVE_SIMD_128_BITS) && !defined(VS_SIMDE_BACKEND) #error "You need at least a 128-bit capable SIMD engine!" 
#endif // HAVE_SIMD_128_BITS @@ -88,7 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) { #define print_m128_2x64(label, vec) ; #endif -#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(SIMDE_BACKEND) +#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(VS_SIMDE_BACKEND) #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 diff --git a/src/util/arch/simde/simd_utils.h b/src/util/arch/simde/simd_utils.h deleted file mode 100644 index b8e7d4a8..00000000 --- a/src/util/arch/simde/simd_utils.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2015-2020, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief SIMD types and primitive operations. - */ - -#ifndef ARCH_SIMDE_SIMD_UTILS_H -#define ARCH_SIMDE_SIMD_UTILS_H - -#include "ue2common.h" -#include "util/simd_types.h" -#include "util/unaligned.h" -#include "util/intrinsics.h" - -#include // for memcpy - -#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 -#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 - -/** \brief LUT for the mask1bit functions. 
*/ -ALIGN_CL_DIRECTIVE static const u8 simd_onebit_masks[] = { - ZEROES_32, ZEROES_32, - ZEROES_31, 0x01, ZEROES_32, - ZEROES_31, 0x02, ZEROES_32, - ZEROES_31, 0x04, ZEROES_32, - ZEROES_31, 0x08, ZEROES_32, - ZEROES_31, 0x10, ZEROES_32, - ZEROES_31, 0x20, ZEROES_32, - ZEROES_31, 0x40, ZEROES_32, - ZEROES_31, 0x80, ZEROES_32, - ZEROES_32, ZEROES_32, -}; - -static really_inline m128 ones128(void) { - return (m128) _mm_set1_epi8(0xFF); -} - -static really_inline m128 zeroes128(void) { - return (m128) _mm_setzero_si128(); -} - -/** \brief Bitwise not for m128*/ -static really_inline m128 not128(m128 a) { - return (m128) _mm_xor_si128(a, ones128()); -} - -/** \brief Return 1 if a and b are different otherwise 0 */ -static really_inline int diff128(m128 a, m128 b) { - return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); -} - -static really_inline int isnonzero128(m128 a) { - return !!diff128(a, zeroes128()); -} - -/** - * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich128(m128 a, m128 b) { - a = _mm_cmpeq_epi32(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; -} - -/** - * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and - * returns a 4-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_128(m128 a, m128 b) { - a = _mm_cmpeq_epi64(a, b); - return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; -} - -static really_really_inline -m128 add_2x64(m128 a, m128 b) { - return (m128) _mm_add_epi64(a, b); -} - -static really_really_inline -m128 sub_2x64(m128 a, m128 b) { - return (m128) _mm_sub_epi64(a, b); -} - -static really_really_inline -m128 lshift64_m128(m128 a, unsigned b) { - return _mm_slli_epi64(a, b); -} - -#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) - -static really_inline m128 set1_16x8(u8 c) { - return _mm_set1_epi8(c); -} - -static really_inline m128 set1_4x32(u32 c) { - return _mm_set1_epi32(c); -} - -static really_inline m128 set1_2x64(u64a c) { - return _mm_set1_epi64x(c); -} - -static really_inline u32 movd(const m128 in) { - return _mm_cvtsi128_si32(in); -} - -static really_inline u64a movq(const m128 in) { - return _mm_cvtsi128_si64(in); -} - -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return _mm_set_epi64x(0LL, *p); -} - -#define CASE_RSHIFT_VECTOR(a, count) case count: return _mm_srli_si128((m128)(a), (count)); break; - -static really_inline -m128 rshiftbyte_m128(const m128 a, int count_immed) { - switch (count_immed) { - case 0: return a; break; - CASE_RSHIFT_VECTOR(a, 1); - CASE_RSHIFT_VECTOR(a, 2); - CASE_RSHIFT_VECTOR(a, 3); - CASE_RSHIFT_VECTOR(a, 4); - CASE_RSHIFT_VECTOR(a, 5); - CASE_RSHIFT_VECTOR(a, 6); - CASE_RSHIFT_VECTOR(a, 7); - CASE_RSHIFT_VECTOR(a, 8); - CASE_RSHIFT_VECTOR(a, 9); - CASE_RSHIFT_VECTOR(a, 10); - CASE_RSHIFT_VECTOR(a, 11); - CASE_RSHIFT_VECTOR(a, 12); - CASE_RSHIFT_VECTOR(a, 13); - CASE_RSHIFT_VECTOR(a, 14); - CASE_RSHIFT_VECTOR(a, 15); - default: return zeroes128(); break; - } -} -#undef CASE_RSHIFT_VECTOR - -#define CASE_LSHIFT_VECTOR(a, count) case count: return _mm_slli_si128((m128)(a), (count)); break; - -static really_inline -m128 lshiftbyte_m128(const m128 a, int count_immed) { - switch (count_immed) { - case 0: return a; 
break; - CASE_LSHIFT_VECTOR(a, 1); - CASE_LSHIFT_VECTOR(a, 2); - CASE_LSHIFT_VECTOR(a, 3); - CASE_LSHIFT_VECTOR(a, 4); - CASE_LSHIFT_VECTOR(a, 5); - CASE_LSHIFT_VECTOR(a, 6); - CASE_LSHIFT_VECTOR(a, 7); - CASE_LSHIFT_VECTOR(a, 8); - CASE_LSHIFT_VECTOR(a, 9); - CASE_LSHIFT_VECTOR(a, 10); - CASE_LSHIFT_VECTOR(a, 11); - CASE_LSHIFT_VECTOR(a, 12); - CASE_LSHIFT_VECTOR(a, 13); - CASE_LSHIFT_VECTOR(a, 14); - CASE_LSHIFT_VECTOR(a, 15); - default: return zeroes128(); break; - } -} -#undef CASE_LSHIFT_VECTOR - -#define extract32from128(a, imm) _mm_extract_epi32(a, imm) -#define extract64from128(a, imm) _mm_extract_epi64(a, imm) - -static really_inline m128 add128(m128 a, m128 b) { - return _mm_add_epi64(a, b); -} - -static really_inline m128 and128(m128 a, m128 b) { - return _mm_and_si128(a,b); -} - -static really_inline m128 xor128(m128 a, m128 b) { - return _mm_xor_si128(a,b); -} - -static really_inline m128 or128(m128 a, m128 b) { - return _mm_or_si128(a,b); -} - -static really_inline m128 andnot128(m128 a, m128 b) { - return _mm_andnot_si128(a, b); -} - -// aligned load -static really_inline m128 load128(const void *ptr) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = vectorscan_assume_aligned(ptr, 16); - return _mm_load_si128((const m128 *)ptr); -} - -// aligned store -static really_inline void store128(void *ptr, m128 a) { - assert(ISALIGNED_N(ptr, alignof(m128))); - ptr = vectorscan_assume_aligned(ptr, 16); - *(m128 *)ptr = a; -} - -// unaligned load -static really_inline m128 loadu128(const void *ptr) { - return _mm_loadu_si128((const m128 *)ptr); -} - -// unaligned store -static really_inline void storeu128(void *ptr, m128 a) { - _mm_storeu_si128 ((m128 *)ptr, a); -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes128(void *ptr, m128 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m128 loadbytes128(const void *ptr, unsigned int n) { - m128 a = zeroes128(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -static really_inline -m128 mask1bit128(unsigned int n) { - assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 95; - mask_idx -= n / 8; - return loadu128(&simd_onebit_masks[mask_idx]); -} - -// switches on bit N in the given vector. -static really_inline -void setbit128(m128 *ptr, unsigned int n) { - *ptr = or128(mask1bit128(n), *ptr); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit128(m128 *ptr, unsigned int n) { - *ptr = andnot128(mask1bit128(n), *ptr); -} - -// tests bit N in the given vector. 
-static really_inline -char testbit128(m128 val, unsigned int n) { - const m128 mask = mask1bit128(n); -#if defined(HAVE_SSE41) - return !_mm_testz_si128(mask, val); -#else - return isnonzero128(and128(mask, val)); -#endif -} - -// offset must be an immediate -#define palignr_imm(r, l, offset) _mm_alignr_epi8(r, l, offset) - -static really_inline -m128 pshufb_m128(m128 a, m128 b) { - return _mm_shuffle_epi8(a, b); -} - -#define CASE_ALIGN_VECTORS(a, b, offset) case offset: return palignr_imm((m128)(a), (m128)(b), (offset)); break; - -static really_really_inline -m128 palignr_sw(m128 r, m128 l, int offset) { - switch (offset) { - case 0: return l; break; - CASE_ALIGN_VECTORS(r, l, 1); - CASE_ALIGN_VECTORS(r, l, 2); - CASE_ALIGN_VECTORS(r, l, 3); - CASE_ALIGN_VECTORS(r, l, 4); - CASE_ALIGN_VECTORS(r, l, 5); - CASE_ALIGN_VECTORS(r, l, 6); - CASE_ALIGN_VECTORS(r, l, 7); - CASE_ALIGN_VECTORS(r, l, 8); - CASE_ALIGN_VECTORS(r, l, 9); - CASE_ALIGN_VECTORS(r, l, 10); - CASE_ALIGN_VECTORS(r, l, 11); - CASE_ALIGN_VECTORS(r, l, 12); - CASE_ALIGN_VECTORS(r, l, 13); - CASE_ALIGN_VECTORS(r, l, 14); - CASE_ALIGN_VECTORS(r, l, 15); - case 16: return r; break; - default: - return zeroes128(); - break; - } -} -#undef CASE_ALIGN_VECTORS - -static really_really_inline -m128 palignr(m128 r, m128 l, int offset) { -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(offset)) { - return palignr_imm(r, l, offset); - } -#endif - return palignr_sw(r, l, offset); -} - -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - if (amount < 0) { - return palignr(zeroes128(), in, -amount); - } else { - return palignr(in, zeroes128(), 16 - amount); - } -} -/* -static really_inline -m128 variable_byte_shift_m128(m128 in, s32 amount) { - assert(amount >= -16 && amount <= 16); - m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb_m128(in, shift_mask); -}*/ - -static really_inline -m128 max_u8_m128(m128 a, m128 b) { - return _mm_max_epu8(a, b); -} - -static really_inline -m128 min_u8_m128(m128 a, m128 b) { - return _mm_min_epu8(a, b); -} - -static really_inline -m128 sadd_u8_m128(m128 a, m128 b) { - return _mm_adds_epu8(a, b); -} - -static really_inline -m128 sub_u8_m128(m128 a, m128 b) { - return _mm_sub_epi8(a, b); -} - -static really_inline -m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { - return _mm_set_epi32(x3, x2, x1, x0); -} - -static really_inline -m128 set2x64(u64a hi, u64a lo) { - return _mm_set_epi64x(hi, lo); -} - -#endif // ARCH_SIMDE_SIMD_UTILS_H diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index ba2bf26f..01429cf2 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -112,6 +112,16 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return (m128) _mm_add_epi64(a, b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return (m128) _mm_sub_epi64(a, b); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { #if defined(HAVE__BUILTIN_CONSTANT_P) @@ -124,8 +134,9 @@ m128 lshift64_m128(m128 a, unsigned b) { } #define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) -#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) -#define movemask128(a) ((u32)_mm_movemask_epi8((a))) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) #if defined(HAVE_AVX512) static 
really_inline m128 cast512to128(const m512 in) {
@@ -668,24 +679,6 @@ m256 combine2x128(m128 hi, m128 lo) {
 }
 #endif //AVX2
 
-#if defined(HAVE_SIMD_128_BITS)
-/**
- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
- * mask indicating which 32-bit words contain differences.
- */
-
-static really_inline u32 diffrich384(m384 a, m384 b) {
-    m128 z = zeroes128();
-    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
-    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
-    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
-    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
-                                  _mm_packs_epi32(a.hi, z));
-    return ~(_mm_movemask_epi8(packed)) & 0xfff;
-}
-
-#endif // HAVE_SIMD_128_BITS
-
 /****
  **** 512-bit Primitives
  ****/

diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index 7e006158..c67d5a85 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2023, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -44,13 +45,14 @@
 #define DOUBLE_CASE_CLEAR 0xdfdf
 #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
 
-
+#if !defined(VS_SIMDE_BACKEND)
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/bitutils.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/bitutils.h"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/bitutils.h"
+#endif
 #else
 #include "util/arch/common/bitutils.h"
 #define clz32_impl clz32_impl_c

diff --git a/src/util/match.hpp b/src/util/match.hpp
index 68497349..6567b212 100644
--- a/src/util/match.hpp
+++ b/src/util/match.hpp
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -49,12 +49,16 @@ const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const l
 template <uint16_t S>
 const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
+#if defined(VS_SIMDE_BACKEND)
+#include "util/arch/x86/match.hpp"
+#else
+#if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/match.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/match.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/match.hpp"
 #endif
+#endif
 
 #endif // MATCH_HPP

diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index b9e2a492..e393d081 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -34,7 +35,16 @@
 #include "util/intrinsics.h"
 #include "ue2common.h"
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(VS_SIMDE_BACKEND)
+#define VECTORSIZE 16
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#if !defined(VS_SIMDE_NATIVE)
+#define SIMDE_NO_NATIVE
+#endif
+#include <simde/x86/sse4.2.h>
+typedef simde__m128i m128;
+#define HAVE_SIMD_128_BITS
+#elif defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/simd_types.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/simd_types.h"
@@ -42,14 +52,6 @@
 #include
"util/arch/ppc64el/simd_types.h" #endif -#if defined(SIMDE_BACKEND) -#define VECTORSIZE 16 -#define SIMDE_ENABLE_NATIVE_ALIASES -#define SIMDE_NO_NATIVE -#include "simde/simde/x86/sse4.2.h" -typedef simde__m128i m128; -#define HAVE_SIMD_128_BITS -#endif #if !defined(m256) && !defined(HAVE_SIMD_256_BITS) typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 0ed66177..01c309b1 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2023, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,14 +62,16 @@ extern const char vbs_mask_data[]; } #endif +#if defined(VS_SIMDE_BACKEND) +#include "util/arch/x86/simd_utils.h" +#else #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/simd_utils.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) #include "util/arch/arm/simd_utils.h" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/simd_utils.h" -#elif defined(SIMDE_BACKEND) -#include "util/arch/simde/simd_utils.h" +#endif #endif #include "util/arch/common/simd_utils.h" diff --git a/src/util/supervector/arch/simde/impl.cpp b/src/util/supervector/arch/simde/impl.cpp deleted file mode 100644 index b1c9b631..00000000 --- a/src/util/supervector/arch/simde/impl.cpp +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * Copyright (c) 2020-2021, VectorCamp PC - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef SIMD_IMPL_HPP -#define SIMD_IMPL_HPP - -#include -#include - -#include "ue2common.h" -#include "util/arch.h" -#include "util/unaligned.h" -#include "util/supervector/supervector.hpp" - -template<> -really_inline SuperVector<16>::SuperVector(SuperVector const &other) -{ - u.v128[0] = other.u.v128[0]; -} - -template<> -really_inline SuperVector<16>::SuperVector(typename base_type::type const v) -{ - u.v128[0] = v; -}; - -template<> -template<> -really_inline SuperVector<16>::SuperVector(int8_t const other) -{ - u.v128[0] = _mm_set1_epi8(other); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(uint8_t const other) -{ - u.v128[0] = _mm_set1_epi8(static_cast(other)); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(int16_t const other) -{ - u.v128[0] = _mm_set1_epi16(other); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(uint16_t const other) -{ - u.v128[0] = _mm_set1_epi16(static_cast(other)); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(int32_t const other) -{ - u.v128[0] = _mm_set1_epi32(other); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(uint32_t const other) -{ - u.v128[0] = _mm_set1_epi32(static_cast(other)); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(int64_t const other) -{ - u.v128[0] = _mm_set1_epi64x(other); -} - -template<> -template<> -really_inline SuperVector<16>::SuperVector(uint64_t const other) -{ - u.v128[0] = _mm_set1_epi64x(static_cast(other)); -} - -// Constants -template<> -really_inline SuperVector<16> SuperVector<16>::Ones() -{ - return {_mm_set1_epi8(0xFF)}; -} - -template<> -really_inline SuperVector<16> SuperVector<16>::Zeroes(void) -{ - return {_mm_set1_epi8(0)}; -} - -// Methods - -template <> -really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) -{ - u.v128[0] = other.u.v128[0]; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const -{ - return {_mm_and_si128(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const -{ - return {_mm_or_si128(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const -{ - return {_mm_xor_si128(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator!() const -{ - return {_mm_xor_si128(u.v128[0], u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const -{ - return {_mm_andnot_si128(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const -{ - return {_mm_cmpeq_epi8(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const -{ - return !(*this == b); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const -{ - return {_mm_cmpgt_epi8(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const -{ - return {_mm_cmplt_epi8(u.v128[0], b.u.v128[0])}; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const -{ - return !(*this < b); -} - -template <> -really_inline SuperVector<16> 
SuperVector<16>::operator<=(SuperVector<16> const &b) const -{ - return !(*this > b); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const -{ - return (*this == b); -} - -template <> -really_inline typename SuperVector<16>::comparemask_type -SuperVector<16>::comparemask(void) const { - return (u32)_mm_movemask_epi8(u.v128[0]); -} - -template <> -really_inline typename SuperVector<16>::comparemask_type -SuperVector<16>::eqmask(SuperVector<16> const b) const { - return eq(b).comparemask(); -} - -template <> really_inline u32 SuperVector<16>::mask_width() { return 1; } - -template <> -really_inline typename SuperVector<16>::comparemask_type -SuperVector<16>::iteration_mask( - typename SuperVector<16>::comparemask_type mask) { - return mask; -} - -// template <> -// template -// really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const -// { -// const uint8_t i = N; -// return {_mm_slli_epi8(u.v128[0], i)}; -// } - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const -{ - return {_mm_slli_epi16(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const -{ - return {_mm_slli_epi32(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const -{ - return {_mm_slli_epi64(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const -{ - return {_mm_slli_si128(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshl_imm() const -{ - return vshl_128_imm(); -} - -// template <> -// template -// really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const -// { -// return {_mm_srli_epi8(u.v128[0], N)}; -// } - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const -{ - return {_mm_srli_epi16(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const -{ - return {_mm_srli_epi32(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const -{ - return {_mm_srli_epi64(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const -{ - return {_mm_srli_si128(u.v128[0], N)}; -} - -template <> -template -really_inline SuperVector<16> SuperVector<16>::vshr_imm() const -{ - return vshr_128_imm(); -} - -#if !defined(HS_OPTIMIZE) -template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; -template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; -template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; -template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; -template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; -#endif - -// template <> -// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const -// { -// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; }); -// if (N == 16) return Zeroes(); -// } - -template <> -really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) 
const -{ -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(N)) { - return {_mm_slli_epi16(u.v128[0], N)}; - } -#endif - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const -{ - return vshl_128(N); -} - -// template <> -// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const -// { -// SuperVector<16> result; -// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); -// if (N == 16) result = Zeroes(); -// return result; -// } - -template <> -really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const -{ - if (N == 0) return *this; - if (N == 16) return Zeroes(); - SuperVector result; - Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; }); - return result; -} - -template <> -really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const -{ - return vshr_128(N); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const -{ - return vshr_128(N); -} - -template <> -really_inline SuperVector<16> 
SuperVector<16>::operator<<(uint8_t const N) const -{ - return vshl_128(N); -} - -template<> -really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) -{ - if (N == 0) return Ones(); - else return Ones().vshr_128(N); -} - -template<> -really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) -{ - if (N == 0) return Ones(); - else return Ones().vshr_128(N); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) -{ - return _mm_loadu_si128((const m128 *)ptr); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) -{ - assert(ISALIGNED_N(ptr, alignof(SuperVector::size))); - ptr = vectorscan_assume_aligned(ptr, SuperVector::size); - return _mm_load_si128((const m128 *)ptr); -} - -template <> -really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) -{ - SuperVector mask = Ones_vshr(16 -len); - SuperVector v = _mm_loadu_si128((const m128 *)ptr); - return mask & v; -} - -template<> -really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset) -{ -#if defined(HAVE__BUILTIN_CONSTANT_P) - if (__builtin_constant_p(offset)) { - if (offset == 16) { - return *this; - } else { - return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], offset)}; - } - } -#endif - switch(offset) { - case 0: return other; break; - case 1: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 1)}; break; - case 2: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 2)}; break; - case 3: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 3)}; break; - case 4: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 4)}; break; - case 5: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 5)}; break; - case 6: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 6)}; break; - case 7: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 7)}; break; - case 8: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 8)}; break; - case 9: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 9)}; break; - case 10: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 10)}; break; - case 11: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 11)}; break; - case 12: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 12)}; break; - case 13: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)}; break; - case 14: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)}; break; - case 15: return {_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)}; break; - default: break; - } - return *this; -} - -template<> -template<> -really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) -{ - return {_mm_shuffle_epi8(u.v128[0], b.u.v128[0])}; -} - -template<> -really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) -{ - SuperVector mask = Ones_vshr(16 -len); - return mask & pshufb(b); -} - -#endif // SIMD_IMPL_HPP diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 730a6fd2..253907fa 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -34,6 +34,9 @@ #include #include +#if defined(VS_SIMDE_BACKEND) +#include "util/supervector/arch/x86/types.hpp" +#else #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/supervector/arch/x86/types.hpp" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) @@ -41,6 +44,7 @@ #elif defined(ARCH_PPC64EL) #include "util/supervector/arch/ppc64el/types.hpp" #endif +#endif // VS_SIMDE_BACKEND #if defined(HAVE_SIMD_512_BITS) 
using Z_TYPE = u64a;
@@ -57,7 +61,7 @@ using Z_TYPE = u32;
 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_POSSHIFT 2
@@ -175,7 +179,7 @@ public:
         typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
         typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL))
         uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
         int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
         uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@@ -382,14 +386,16 @@ struct Unroller
 };
 
 #if defined(HS_OPTIMIZE)
+#if defined(VS_SIMDE_BACKEND)
+#include "util/supervector/arch/x86/impl.cpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/supervector/arch/x86/impl.cpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/supervector/arch/arm/impl.cpp"
 #elif defined(ARCH_PPC64EL)
 #include "util/supervector/arch/ppc64el/impl.cpp"
-#elif defined(SIMDE_BACKEND)
-#include "util/supervector/arch/simde/impl.cpp"
+#endif
 #endif
 #endif

diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index a9737bd2..272d5456 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -667,7 +668,7 @@ TEST(SimdUtilsTest, movq) {
     ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));
 
 #if defined(HAVE_SIMD_128_BITS)
-#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(SIMDE_BACKEND)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(VS_SIMDE_BACKEND)
     simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     int64x2_t a = { 0x123456789abcdefLL, ~0LL };

From 8c7b503ac49899b8f85ff23c05594fa6c53956cf Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Mon, 27 Nov 2023 20:51:29 +0000
Subject: [PATCH 14/22] fix TUNE_FLAG for SIMDE_BACKEND

---
 cmake/archdetect.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake
index 87c4c4e7..494269c2 100644
--- a/cmake/archdetect.cmake
+++ b/cmake/archdetect.cmake
@@ -69,7 +69,7 @@ if (USE_CPU_NATIVE)
 else()
     if (SIMDE_BACKEND)
         set(GNUCC_ARCH native)
-        set(TUNE_FLAG generic)
+        set(TUNE_FLAG native)
     elseif (ARCH_IA32 OR ARCH_X86_64)
         set(GNUCC_ARCH native)
         set(TUNE_FLAG generic)

From 23aeaecf53c9edec29dcf8702387b46cad56e081 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Mon, 27 Nov 2023 20:51:47 +0000
Subject: [PATCH 15/22] use pkg-config for SIMDe

---
 cmake/simde.cmake | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/cmake/simde.cmake b/cmake/simde.cmake
index 12c56c6d..bf9766b6 100644
--- a/cmake/simde.cmake
+++ b/cmake/simde.cmake
@@ -1,9 +1,15 @@
 # include_directories(${PROJECT_SOURCE_DIR}/simde/simde)
-set(CMAKE_C_FLAGS
"${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND") +pkg_check_modules(SIMDE simde) -if (SIMDE_NATIVE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") +if (SIMDE_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND") + + if (SIMDE_NATIVE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd") + endif() +else() + message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system") endif() From f5e508b13fcef92870028a5ea4c5543a5a962b7d Mon Sep 17 00:00:00 2001 From: Konstantnos Margaritis Date: Mon, 27 Nov 2023 20:52:52 +0000 Subject: [PATCH 16/22] fix compilation for SIMDe --- src/util/arch/x86/simd_utils.h | 4 ++-- src/util/supervector/arch/x86/impl.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 01429cf2..49797aba 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -180,7 +180,7 @@ m128 load_m128_from_u64a(const u64a *p) { static really_inline m128 rshiftbyte_m128(const m128 a, int count_immed) { -#if defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(count_immed)) { return _mm_srli_si128(a, count_immed); } @@ -211,7 +211,7 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) { static really_inline m128 lshiftbyte_m128(const m128 a, int count_immed) { -#if defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(count_immed)) { return _mm_slli_si128(a, count_immed); } diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 3d232e49..b8a75c95 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const { -#if defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(N)) { return {_mm_slli_si128(u.v128[0], N)}; } @@ -451,7 +451,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const { -#if defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(N)) { return {_mm_srli_si128(u.v128[0], N)}; } @@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { -#if defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(N)) { return {_mm_srli_si128(u.v128[0], N)}; } @@ -483,7 +483,7 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { -#if 
defined(HAVE__BUILTIN_CONSTANT_P) +#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND) if (__builtin_constant_p(N)) { return {_mm_slli_si128(u.v128[0], N)}; } From be9ce687677bfea43b2e49fc4349b7eebf6312cd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 28 Nov 2023 12:06:46 +0000 Subject: [PATCH 17/22] make diffrich384 available on all arches --- src/util/arch/common/simd_utils.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 89190648..24331b10 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -455,7 +455,6 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } -#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64) /** * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. @@ -464,7 +463,6 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8); } -#endif /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and From 3beda7e5e0aec799f6740955f570ae25d0703f12 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 28 Nov 2023 14:09:26 +0200 Subject: [PATCH 18/22] add missing else --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ca7b994..fbe8e36e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) elseif (ARCH_PPC64EL) include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake) set(ARCH_FLAG mcpu) +else() message(FATAL_ERROR "Unsupported platform") endif () From 6332cb91f56b68667b86970b81599a76158300e8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 28 Nov 2023 17:28:48 +0200 Subject: [PATCH 19/22] separate ARCH_FLAG logic --- CMakeLists.txt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fbe8e36e..74b1f6f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,20 +121,22 @@ include (${CMAKE_MODULE_PATH}/osdetection.cmake) if(SIMDE_BACKEND) include (${CMAKE_MODULE_PATH}/simde.cmake) - set(ARCH_FLAG march) elseif (ARCH_IA32 OR ARCH_X86_64) include (${CMAKE_MODULE_PATH}/cflags-x86.cmake) - set(ARCH_FLAG march) elseif (ARCH_ARM32 OR ARCH_AARCH64) include (${CMAKE_MODULE_PATH}/cflags-arm.cmake) - set(ARCH_FLAG march) elseif (ARCH_PPC64EL) include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake) - set(ARCH_FLAG mcpu) -else() +else () message(FATAL_ERROR "Unsupported platform") endif () +if (ARCH_PPC64EL) + set(ARCH_FLAG mcpu) +else () + set(ARCH_FLAG march) +endif () + # Detect Native arch flags if requested include (${CMAKE_MODULE_PATH}/archdetect.cmake) From 9fd0ce5d444770248fbb5330fc9c7a561be5ef23 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 28 Nov 2023 17:39:55 +0200 Subject: [PATCH 20/22] search for SIMDE sse4.2.h header --- cmake/simde.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/simde.cmake b/cmake/simde.cmake index bf9766b6..8cac2bdd 100644 --- a/cmake/simde.cmake +++ b/cmake/simde.cmake @@ -1,8 +1,8 @@ -# include_directories(${PROJECT_SOURCE_DIR}/simde/simde) +include_directories(${PROJECT_SOURCE_DIR}/simde/simde) -pkg_check_modules(SIMDE simde) +CHECK_INCLUDE_FILES("simde/x86/sse4.2.h" SIMDE_SSE42_H_FOUND) 
-if (SIMDE_FOUND)
+if (SIMDE_SSE42_H_FOUND)
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
 
 if (SIMDE_NATIVE)
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
 endif()
 else()
 message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
 endif()

From d3f6d2ad0616a84e1c4672379f9a407f90922160 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Tue, 28 Nov 2023 18:27:08 +0200
Subject: [PATCH 21/22] updates to the README

---
 README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 095ab8ba..7f7c2f53 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
 # About Vectorscan
 A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
-is 100% functional, and Power VSX are in development. ARM SVE2 support is in ongoing with
+and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
 access to hardware now. More platforms will follow in the future.
+Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
+port, which can either be used for platforms without official SIMD support,
+as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
+for reference and comparison purposes.
 
 Vectorscan will follow Intel's API and internal algorithms where possible, but will not
 hesitate to make code changes where it is thought of giving better performance or better
@@ -148,6 +152,11 @@ Common options for Cmake are:
 * `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
 
+## SIMDe options
+
+* `SIMDE_BACKEND=[On|Off]` Enable the SIMDe backend. If this is chosen, all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be used instead. This allows Vectorscan to build and run on architectures without SIMD support.
+* `SIMDE_NATIVE=[On|Off]` Enable SIMDe's native emulation of x86 SSE4.2 intrinsics on the build platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
+
 ## Build
 If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running
@@ -177,4 +186,4 @@ the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/)
 And you can find the source code [on Github](https://github.com/intel/hyperscan).
 
-For Intel Hyperscan related issues and questions, please follow the relevant links there.
\ No newline at end of file
+For Intel Hyperscan related issues and questions, please follow the relevant links there.

From 519bd64c65138ee4896b4f780097ecda506671e8 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 29 Nov 2023 01:39:05 +0200
Subject: [PATCH 22/22] fix failing allbits test for ppc64le on clang15

---
 src/util/bitfield.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/util/bitfield.h b/src/util/bitfield.h
index 202232b6..4a3fbd6e 100644
--- a/src/util/bitfield.h
+++ b/src/util/bitfield.h
@@ -138,8 +138,8 @@ public:
 
     /// Flip all bits.
     void flip() {
-        for (auto &e : bits) {
-            e = ~e;
+        for (size_t i = 0; i < size(); i++) {
+            flip(i);
         }
         clear_trailer();
     }
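
A note for reviewers on how the VS_SIMDE_BACKEND path defined in src/util/simd_types.h above is expected to behave: defining SIMDE_NO_NATIVE forces SIMDe's portable implementations, while SIMDE_ENABLE_NATIVE_ALIASES exposes them under the original _mm_* names, so the existing x86 backend sources compile unchanged on any host. The stand-alone sketch below is illustrative only and not part of the series; the file name and the final equality check are ours.

    // simde_alias_sketch.cpp -- illustrative only, not part of this series.
    // Mirrors the macro setup of src/util/simd_types.h under VS_SIMDE_BACKEND
    // (without VS_SIMDE_NATIVE): use SIMDe's portable implementations and
    // expose them under the original Intel intrinsic names.
    #define SIMDE_NO_NATIVE
    #define SIMDE_ENABLE_NATIVE_ALIASES
    #include <simde/x86/sse4.2.h>

    typedef simde__m128i m128; // same typedef the series introduces

    int main(void) {
        m128 a = _mm_set1_epi8(0x55); // resolves to simde_mm_set1_epi8
        m128 b = _mm_set1_epi8(0x55);
        // A byte-wise compare plus movemask yields all 16 mask bits set for
        // equal inputs, regardless of the architecture this is compiled on.
        return _mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xffff ? 0 : 1;
    }

With the CMake changes in this series applied, the corresponding full-library configuration is cmake -DSIMDE_BACKEND=On (optionally -DSIMDE_NATIVE=On for native emulation), as documented in the README section added by PATCH 21.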