diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..8dd6c091
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1db128ba..30c8663e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,15 +119,22 @@ endif()
 # Detect OS and if Fat Runtime is available
 include (${CMAKE_MODULE_PATH}/osdetection.cmake)
 
-if (ARCH_IA32 OR ARCH_X86_64)
+if(SIMDE_BACKEND)
+    include (${CMAKE_MODULE_PATH}/simde.cmake)
+elseif (ARCH_IA32 OR ARCH_X86_64)
     include (${CMAKE_MODULE_PATH}/cflags-x86.cmake)
-    set(ARCH_FLAG march)
 elseif (ARCH_ARM32 OR ARCH_AARCH64)
     include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
-    set(ARCH_FLAG march)
 elseif (ARCH_PPC64EL)
     include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
+else ()
+    message(FATAL_ERROR "Unsupported platform")
+endif ()
+
+if (ARCH_PPC64EL)
     set(ARCH_FLAG mcpu)
+else ()
+    set(ARCH_FLAG march)
 endif ()
 
 # Detect Native arch flags if requested
@@ -239,8 +246,11 @@ set (hs_exec_common_SRCS
     src/util/arch/common/cpuid_flags.h
     src/util/multibit.c
     )
-
-if (ARCH_IA32 OR ARCH_X86_64)
+if (SIMDE_BACKEND)
+set (hs_exec_common_SRCS
+    ${hs_exec_common_SRCS}
+    src/util/arch/simde/cpuid_flags.c)
+elseif (ARCH_IA32 OR ARCH_X86_64)
 set (hs_exec_common_SRCS
     ${hs_exec_common_SRCS}
     src/util/arch/x86/cpuid_flags.c
@@ -398,7 +408,12 @@ set (hs_exec_SRCS
     src/database.h
     )
 
-if (ARCH_IA32 OR ARCH_X86_64)
+if (SIMDE_BACKEND)
+set (hs_exec_SRCS
+    ${hs_exec_SRCS}
+    src/nfa/vermicelli_simd.cpp
+    src/util/supervector/arch/x86/impl.cpp)
+elseif (ARCH_IA32 OR ARCH_X86_64)
 set (hs_exec_SRCS
     ${hs_exec_SRCS}
     src/nfa/vermicelli_simd.cpp
diff --git a/README.md b/README.md
index 095ab8ba..7f7c2f53 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
 # About Vectorscan
 
 A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
-is 100% functional, and Power VSX are in development. ARM SVE2 support is in ongoing with
+and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
 access to hardware now. More platforms will follow in the future.
+Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
+port, which can either be used on platforms without official SIMD support,
+as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
+for reference and comparison purposes.
 
 Vectorscan will follow Intel's API and internal algorithms where possible, but will not
 hesitate to make code changes where it is thought of giving better performance or better
@@ -148,6 +152,11 @@ Common options for Cmake are:
 
 * `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
 
+## SIMDe options
+
+* `SIMDE_BACKEND=[On|Off]` Enable the SIMDe backend. If this is chosen, all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled instead. This enables Vectorscan to build and run on architectures without SIMD support.
+* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the build platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
+
 ## Build
 
 If `cmake` has completed successfully you can run `make` in the same directory, if you have
 a multi-core system with `N` cores, running
@@ -177,4 +186,4 @@ the
 [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/)
 
 And you can find the source code [on Github](https://github.com/intel/hyperscan).
-For Intel Hyperscan related issues and questions, please follow the relevant links there.
\ No newline at end of file
+For Intel Hyperscan related issues and questions, please follow the relevant links there.
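A typical configuration sequence under the new options might look like the console sketch below. Only the `SIMDE_BACKEND`/`SIMDE_NATIVE` flags and the `simde` submodule come from this patch; the out-of-tree `build` directory and generator defaults are illustrative assumptions.

```sh
# fetch the SIMDe sources pinned by the new submodule
git submodule update --init simde

# pure-emulation build: portable C implementations of the SSE4.2 backend
mkdir build && cd build
cmake -DSIMDE_BACKEND=On ..

# or: let SIMDe lower the SSE4.2 intrinsics to the host's own SIMD
# (SIMDE_NATIVE only takes effect together with SIMDE_BACKEND)
cmake -DSIMDE_BACKEND=On -DSIMDE_NATIVE=On ..

make -j"$(nproc)"
```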
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 90c685c4..63391a68 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,4 +1,7 @@
-if (NOT FAT_RUNTIME AND (BUILD_STATIC_AND_SHARED OR BUILD_STATIC_LIBS))
+include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
+
+if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
     add_executable(benchmarks benchmarks.cpp)
     set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
         "-Wall -Wno-unused-variable")
diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake
index 015140fe..494269c2 100644
--- a/cmake/archdetect.cmake
+++ b/cmake/archdetect.cmake
@@ -67,7 +67,10 @@ if (USE_CPU_NATIVE)
             message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
         endif()
     else()
-        if (ARCH_IA32 OR ARCH_X86_64)
+        if (SIMDE_BACKEND)
+            set(GNUCC_ARCH native)
+            set(TUNE_FLAG native)
+        elseif (ARCH_IA32 OR ARCH_X86_64)
             set(GNUCC_ARCH native)
             set(TUNE_FLAG generic)
         elseif(ARCH_AARCH64)
@@ -84,8 +87,11 @@ else()
         elseif(ARCH_ARM32)
             set(GNUCC_ARCH armv7a)
             set(TUNE_FLAG generic)
+        elseif(ARCH_PPC64EL)
+            set(GNUCC_ARCH power8)
+            set(TUNE_FLAG power8)
         else()
-            set(GNUCC_ARCH power9)
-            set(TUNE_FLAG power9)
+            set(GNUCC_ARCH native)
+            set(TUNE_FLAG native)
         endif()
     endif()
diff --git a/cmake/simde.cmake b/cmake/simde.cmake
new file mode 100644
index 00000000..8cac2bdd
--- /dev/null
+++ b/cmake/simde.cmake
@@ -0,0 +1,15 @@
+include_directories(${PROJECT_SOURCE_DIR}/simde/simde)
+
+CHECK_INCLUDE_FILES("simde/x86/sse4.2.h" SIMDE_SSE42_H_FOUND)
+
+if (SIMDE_SSE42_H_FOUND)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
+
+    if (SIMDE_NATIVE)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
+    endif()
+else()
+    message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
+endif()
diff --git a/simde b/simde
new file mode 160000
index 00000000..aae22459
--- /dev/null
+++ b/simde
@@ -0,0 +1 @@
+Subproject commit aae22459fa284e9fc2b7d4b8e4571afa0418125f
diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c
index 0af36b6c..74a8fc1e 100644
--- a/src/hs_valid_platform.c
+++ b/src/hs_valid_platform.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2020-2023, VectorCamp PC
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,28 +30,30 @@
 #include "config.h"
 #include "hs_common.h"
 #include "ue2common.h"
+#if !defined(VS_SIMDE_BACKEND)
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/cpuid_inline.h"
 #elif defined(ARCH_AARCH64)
 #include "util/arch/arm/cpuid_inline.h"
 #endif
+#endif
 
 HS_PUBLIC_API
 hs_error_t HS_CDECL hs_valid_platform(void) {
     /* Hyperscan requires SSSE3, anything else is a bonus */
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64))
     if (check_ssse3()) {
         return HS_SUCCESS;
     } else {
         return HS_ARCH_ERROR;
     }
-#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
     if (check_neon()) {
         return HS_SUCCESS;
     } else {
         return HS_ARCH_ERROR;
     }
-#elif defined(ARCH_PPC64EL)
-    return HS_SUCCESS;
+#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
+    return HS_SUCCESS;
 #endif
 }
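With `VS_SIMDE_BACKEND` defined, the check above compiles down to an unconditional `return HS_SUCCESS`, since SIMDe requires no particular CPU features. A minimal caller sketch using the existing public API (the `<hs/...>` install path is the usual layout, an assumption here):

```c
#include <stdio.h>
#include <hs/hs_common.h>   /* declares hs_valid_platform() */

int main(void) {
    /* On a SIMDe build this always succeeds; on native builds it probes
     * for SSSE3 / NEON as before. */
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "CPU lacks the features this build requires\n");
        return 1;
    }
    printf("platform OK\n");
    return 0;
}
```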
diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp
index 0f8e2a7b..feeb54ab 100644
--- a/src/nfa/shufti_simd.hpp
+++ b/src/nfa/shufti_simd.hpp
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
  * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
@@ -52,13 +52,17 @@ template <uint16_t S>
 static really_inline
 SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars);
 
+#if defined(VS_SIMDE_BACKEND)
+#include "x86/shufti.hpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "x86/shufti.hpp"
-#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#elif (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
 #include "arm/shufti.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "ppc64el/shufti.hpp"
 #endif
+#endif
 
 template <uint16_t S>
 static really_inline
diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp
index e07e92f6..c1028156 100644
--- a/src/nfa/truffle_simd.hpp
+++ b/src/nfa/truffle_simd.hpp
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -45,6 +45,9 @@ template <uint16_t S>
 static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars);
 
+#if defined(VS_SIMDE_BACKEND)
+#include "x86/truffle.hpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "x86/truffle.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -52,6 +55,7 @@
 #include "arm/truffle.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "ppc64el/truffle.hpp"
 #endif
+#endif
 
 template <uint16_t S>
 static really_inline
diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp
index a0da0719..67ac1dac 100644
--- a/src/nfa/vermicelli_simd.cpp
+++ b/src/nfa/vermicelli_simd.cpp
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2020, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
  * Copyright (c) 2021, Arm Limited
  *
  * Redistribution and use in source and binary forms, with or without
@@ -71,6 +71,9 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S>
                                       SuperVector<S> const mask1, SuperVector<S> const mask2,
                                       u8 const c1, u8 const c2, u8 const m1, u8 const m2,
                                       u8 const *buf, u16 const len);
 
+#if defined(VS_SIMDE_BACKEND)
+#include "x86/vermicelli.hpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "x86/vermicelli.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -78,6 +81,7 @@
 #include "arm/vermicelli.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "ppc64el/vermicelli.hpp"
 #endif
+#endif
 
 template <uint16_t S>
 static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u8 const *buf_end) {
diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h
index e5ff5bc1..e5ab0d05 100644
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -214,16 +214,22 @@ u64a compress64_impl_c(u64a x, u64a m) {
 }
 
 static really_inline
-m128 compress128_impl_c(m128 xvec, m128 mvec) {
-    u64a ALIGN_ATTR(16) x[2];
-    u64a ALIGN_ATTR(16) m[2];
-    store128(x, xvec);
-    store128(m, mvec);
+m128 compress128_impl_c(m128 x, m128 m) {
+    m128 one = set1_2x64(1);
+    m128 bitset = one;
+    m128 vres = zeroes128();
+    while (isnonzero128(m)) {
+        m128 mm = sub_2x64(zeroes128(), m);
+        m128 tv = and128(x, m);
+        tv = and128(tv, mm);
 
-    compress64_impl_c(x[0], m[0]);
-    compress64_impl_c(x[1], m[1]);
-
-    return xvec;
+        m128 mask = not128(eq64_m128(tv, zeroes128()));
+        mask = and128(bitset, mask);
+        vres = or128(vres, mask);
+        m = and128(m, sub_2x64(m, one));
+        bitset = lshift64_m128(bitset, 1);
+    }
+    return vres;
 }
 
 static really_inline
@@ -303,16 +309,20 @@ u64a expand64_impl_c(u64a x, u64a m) {
 }
 
 static really_inline
-m128 expand128_impl_c(m128 xvec, m128 mvec) {
-    u64a ALIGN_ATTR(16) x[2];
-    u64a ALIGN_ATTR(16) m[2];
-    store128(x, xvec);
-    store128(m, mvec);
-
-    expand64_impl_c(x[0], m[0]);
-    expand64_impl_c(x[1], m[1]);
-
-    return xvec;
+m128 expand128_impl_c(m128 x, m128 m) {
+    m128 one = set1_2x64(1);
+    m128 bb = one;
+    m128 res = zeroes128();
+    while (isnonzero128(m)) {
+        m128 xm = and128(x, bb);
+        m128 mm = sub_2x64(zeroes128(), m);
+        m128 mask = not128(eq64_m128(xm, zeroes128()));
+        mask = and128(mask, and128(m, mm));
+        res = or128(res, mask);
+        m = and128(m, sub_2x64(m, one));
+        bb = lshift64_m128(bb, 1);
+    }
+    return res;
 }
 
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
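The rewritten fallbacks now compute a real per-64-bit-lane compress/expand; the previous code called `compress64_impl_c`/`expand64_impl_c` on stack copies and discarded the results. A scalar model of the same recurrence (my illustration, not code from the patch) may help when reading the vector loops: `m & -m` isolates the lowest set bit of the mask, `m &= m - 1` clears it.

```c
#include <stdint.h>

/* Scalar model of compress128_impl_c's per-lane loop: gather the bits of x
 * selected by m into the low end of the result (PEXT-like). */
static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    while (m) {
        uint64_t lowest = m & -m;   /* lowest set bit of the mask */
        if (x & lowest) {
            res |= bb;              /* emit one result bit */
        }
        m &= m - 1;                 /* drop that mask bit */
        bb <<= 1;
    }
    return res;
}

/* Scalar model of expand128_impl_c: scatter the low bits of x to the
 * positions of the set bits of m (PDEP-like, the inverse operation). */
static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0, bb = 1;
    while (m) {
        uint64_t lowest = m & -m;
        if (x & bb) {
            res |= lowest;
        }
        m &= m - 1;
        bb <<= 1;
    }
    return res;
}
```

In the vector versions, the scalar `if` becomes `not128(eq64_m128(..., zeroes128()))`, an all-ones/all-zeroes lane mask, which is why the new `sub_2x64` and `eq64_m128` primitives are introduced further down in this patch.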
diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h
index d142ee9a..24331b10 100644
--- a/src/util/arch/common/simd_utils.h
+++ b/src/util/arch/common/simd_utils.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2020, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -41,7 +41,7 @@
 
 #include <string.h> // for memcpy
 
-#if !defined(HAVE_SIMD_128_BITS)
+#if !defined(HAVE_SIMD_128_BITS) && !defined(VS_SIMDE_BACKEND)
 #error "You need at least a 128-bit capable SIMD engine!"
 #endif // HAVE_SIMD_128_BITS
 
@@ -88,7 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
 #define print_m128_2x64(label, vec) ;
 #endif
 
-#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
+#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(VS_SIMDE_BACKEND)
 #define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
 #define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
@@ -455,7 +455,6 @@ static really_inline int isnonzero384(m384 a) {
     return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
 }
 
-#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64)
 /**
  * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
  * mask indicating which 32-bit words contain differences.
@@ -464,7 +463,6 @@ static really_inline u32 diffrich384(m384 a, m384 b) {
     return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) |
            (diffrich128(a.hi, b.hi) << 8);
 }
-#endif
 
 /**
  * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
diff --git a/src/util/arch/simde/cpuid_flags.c b/src/util/arch/simde/cpuid_flags.c
new file mode 100644
index 00000000..a2f3758c
--- /dev/null
+++ b/src/util/arch/simde/cpuid_flags.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2023, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util/arch/common/cpuid_flags.h"
+#include "ue2common.h"
+#include "hs_compile.h" // for HS_MODE_ flags
+#include "util/arch.h"
+
+u64a cpuid_flags(void) {
+    return 0;
+}
+
+u32 cpuid_tune(void) {
+    return HS_TUNE_FAMILY_GENERIC;
+}
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h
index ba2bf26f..49797aba 100644
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -112,6 +112,16 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
 #endif
 }
 
+static really_really_inline
+m128 add_2x64(m128 a, m128 b) {
+    return (m128) _mm_add_epi64(a, b);
+}
+
+static really_really_inline
+m128 sub_2x64(m128 a, m128 b) {
+    return (m128) _mm_sub_epi64(a, b);
+}
+
 static really_really_inline
 m128 lshift64_m128(m128 a, unsigned b) {
 #if defined(HAVE__BUILTIN_CONSTANT_P)
@@ -124,8 +134,9 @@ m128 lshift64_m128(m128 a, unsigned b) {
 }
 
 #define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
-#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
-#define movemask128(a)  ((u32)_mm_movemask_epi8((a)))
+#define eq128(a, b)     _mm_cmpeq_epi8((a), (b))
+#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b))
+#define movemask128(a)  ((u32)_mm_movemask_epi8((a)))
 
 #if defined(HAVE_AVX512)
 static really_inline m128 cast512to128(const m512 in) {
@@ -169,7 +180,7 @@ m128 load_m128_from_u64a(const u64a *p) {
 
 static really_inline
 m128 rshiftbyte_m128(const m128 a, int count_immed) {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(count_immed)) {
         return _mm_srli_si128(a, count_immed);
     }
@@ -200,7 +211,7 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) {
 
 static really_inline
 m128 lshiftbyte_m128(const m128 a, int count_immed) {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(count_immed)) {
         return _mm_slli_si128(a, count_immed);
     }
@@ -668,24 +679,6 @@ m256 combine2x128(m128 hi, m128 lo) {
 }
 #endif //AVX2
 
-#if defined(HAVE_SIMD_128_BITS)
-/**
- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
- * mask indicating which 32-bit words contain differences.
- */
-
-static really_inline u32 diffrich384(m384 a, m384 b) {
-    m128 z = zeroes128();
-    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
-    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
-    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
-    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
-                                  _mm_packs_epi32(a.hi, z));
-    return ~(_mm_movemask_epi8(packed)) & 0xfff;
-}
-
-#endif // HAVE_SIMD_128_BITS
-
 /****
  **** 512-bit Primitives
  ****/
diff --git a/src/util/bitfield.h b/src/util/bitfield.h
index 202232b6..4a3fbd6e 100644
--- a/src/util/bitfield.h
+++ b/src/util/bitfield.h
@@ -138,8 +138,8 @@ public:
 
     /// Flip all bits.
     void flip() {
-        for (auto &e : bits) {
-            e = ~e;
+        for (size_t i = 0; i < size(); i++) {
+            flip(i);
         }
         clear_trailer();
     }
diff --git a/src/util/bitutils.h b/src/util/bitutils.h
index ffc8f45d..c67d5a85 100644
--- a/src/util/bitutils.h
+++ b/src/util/bitutils.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -44,7 +45,7 @@
 #define DOUBLE_CASE_CLEAR 0xdfdf
 #define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
 
-
+#if !defined(VS_SIMDE_BACKEND)
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/bitutils.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -52,6 +53,32 @@
 #include "util/arch/arm/bitutils.h"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/bitutils.h"
 #endif
+#else
+#include "util/arch/common/bitutils.h"
+#define clz32_impl clz32_impl_c
+#define clz64_impl clz64_impl_c
+#define ctz32_impl ctz32_impl_c
+#define ctz64_impl ctz64_impl_c
+#define lg2_impl lg2_impl_c
+#define lg2_64_impl lg2_64_impl_c
+#define findAndClearLSB_32_impl findAndClearLSB_32_impl_c
+#define findAndClearLSB_64_impl findAndClearLSB_64_impl_c
+#define findAndClearMSB_32_impl findAndClearMSB_32_impl_c
+#define findAndClearMSB_64_impl findAndClearMSB_64_impl_c
+#define compress32_impl compress32_impl_c
+#define compress64_impl compress64_impl_c
+#define compress128_impl compress128_impl_c
+#define expand32_impl expand32_impl_c
+#define expand64_impl expand64_impl_c
+#define expand128_impl expand128_impl_c
+#define bf64_iterate_impl bf64_iterate_impl_c
+#define bf64_set_impl bf64_set_impl_c
+#define bf64_unset_impl bf64_unset_impl_c
+#define rank_in_mask32_impl rank_in_mask32_impl_c
+#define rank_in_mask64_impl rank_in_mask64_impl_c
+#define pext32_impl pext32_impl_c
+#define pext64_impl pext64_impl_c
+#endif
 
 static really_inline
 u32 clz32(u32 x) {
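The `#define` table above routes every arch-dispatched `*_impl` name to the portable `*_impl_c` fallback from `util/arch/common/bitutils.h`, so the thin public wrappers lower in this header need no changes. A sketch of the pattern (the wrapper body is assumed from the surrounding file, but this is how the indirection resolves):

```c
/* Under VS_SIMDE_BACKEND the macro table substitutes the portable
 * fallback at preprocessing time: */
#define clz32_impl clz32_impl_c

static really_inline
u32 clz32(u32 x) {
    return clz32_impl(x);   /* expands to clz32_impl_c(x) */
}
```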
diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
index 08eb6ba6..64f9e9ba 100644
--- a/src/util/intrinsics.h
+++ b/src/util/intrinsics.h
@@ -74,8 +74,6 @@
 # endif
 #elif defined(USE_PPC64EL_ALTIVEC_H)
 #include <altivec.h>
-#else
-#error no intrinsics file
 #endif
 
 #endif // INTRINSICS_H
diff --git a/src/util/match.hpp b/src/util/match.hpp
index 003c665f..6567b212 100644
--- a/src/util/match.hpp
+++ b/src/util/match.hpp
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
- * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -49,6 +49,9 @@ const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const len = S);
 
 template <uint16_t S>
 const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
 
+#if defined(VS_SIMDE_BACKEND)
+#include "util/arch/x86/match.hpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/match.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -56,5 +59,6 @@
 #include "util/arch/arm/match.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/match.hpp"
 #endif
+#endif
 
 #endif // MATCH_HPP
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 4f0fd1a9..e393d081 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -34,7 +35,16 @@
 #include "util/intrinsics.h"
 #include "ue2common.h"
 
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(VS_SIMDE_BACKEND)
+#define VECTORSIZE 16
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#if !defined(VS_SIMDE_NATIVE)
+#define SIMDE_NO_NATIVE
+#endif
+#include <simde/x86/sse4.2.h>
+typedef simde__m128i m128;
+#define HAVE_SIMD_128_BITS
+#elif defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/simd_types.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
 #include "util/arch/arm/simd_types.h"
@@ -42,9 +52,6 @@
 #include "util/arch/ppc64el/simd_types.h"
 #endif
 
-#if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
-typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
-#endif
 
 #if !defined(m256) && !defined(HAVE_SIMD_256_BITS)
 typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
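This block is the heart of the port: with `SIMDE_ENABLE_NATIVE_ALIASES`, every existing `_mm_*` call in the x86 backend resolves to a `simde_mm_*` portable implementation, and `SIMDE_NO_NATIVE` forces pure emulation unless `VS_SIMDE_NATIVE` was requested at configure time. A standalone sketch of the same mechanism (the helper function is illustrative, not part of the patch):

```c
/* With SIMDE_NO_NATIVE, SIMDe implements the SSE calls in portable C;
 * without it, SIMDe may lower them to the host's own SIMD. */
#define SIMDE_ENABLE_NATIVE_ALIASES
#define SIMDE_NO_NATIVE
#include <simde/x86/sse4.2.h>

typedef simde__m128i m128;

/* Illustrative helper: byte-wise equality written exactly as the x86
 * backend writes it, but compilable on any architecture. */
static m128 eq_bytes(m128 a, m128 b) {
    return _mm_cmpeq_epi8(a, b);   /* alias for simde_mm_cmpeq_epi8() */
}
```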
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 2f0012c6..01c309b1 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -61,6 +62,9 @@ extern const char vbs_mask_data[];
 }
 #endif
 
+#if defined(VS_SIMDE_BACKEND)
+#include "util/arch/x86/simd_utils.h"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/simd_utils.h"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -68,6 +72,7 @@
 #include "util/arch/arm/simd_utils.h"
 #elif defined(ARCH_PPC64EL)
 #include "util/arch/ppc64el/simd_utils.h"
 #endif
+#endif
 
 #include "util/arch/common/simd_utils.h"
diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index 3d232e49..b8a75c95 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
     }
@@ -451,7 +451,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
     }
@@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_srli_si128(u.v128[0], N)};
     }
@@ -483,7 +483,7 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
 template <>
 really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P)
+#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
     if (__builtin_constant_p(N)) {
         return {_mm_slli_si128(u.v128[0], N)};
     }
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index c0200575..253907fa 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -34,6 +34,9 @@
 #include <cstdint>
 #include <cstdio>
 
+#if defined(VS_SIMDE_BACKEND)
+#include "util/supervector/arch/x86/types.hpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/supervector/arch/x86/types.hpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -41,6 +44,7 @@
 #include "util/supervector/arch/arm/types.hpp"
 #elif defined(ARCH_PPC64EL)
 #include "util/supervector/arch/ppc64el/types.hpp"
 #endif
+#endif // VS_SIMDE_BACKEND
 
 #if defined(HAVE_SIMD_512_BITS)
 using Z_TYPE = u64a;
@@ -57,7 +61,7 @@ using Z_TYPE = u32;
 #define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
 #elif defined(HAVE_SIMD_128_BITS)
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
 using Z_TYPE = u64a;
 #define Z_BITS 64
 #define Z_POSSHIFT 2
@@ -175,7 +179,7 @@ public:
     typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
     typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
 
-#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL))
     uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
     int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
     uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@@ -382,6 +386,9 @@ struct Unroller
 };
 
 #if defined(HS_OPTIMIZE)
+#if defined(VS_SIMDE_BACKEND)
+#include "util/supervector/arch/x86/impl.cpp"
+#else
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/supervector/arch/x86/impl.cpp"
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@@ -390,6 +397,7 @@ struct Unroller
 #include "util/supervector/arch/ppc64el/impl.cpp"
 #endif
 #endif
+#endif
 
 #endif /* SUPERVECTOR_H */
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index c57cd598..272d5456 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -667,7 +668,7 @@ TEST(SimdUtilsTest, movq) {
     ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));
 
 #if defined(HAVE_SIMD_128_BITS)
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
+#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(VS_SIMDE_BACKEND)
     simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
 #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
     int64x2_t a = { 0x123456789abcdefLL, ~0LL };