Merge pull request #203 from VectorCamp/feature/enable-simde-backend

Feature/enable simde backend
This commit is contained in:
Konstantinos Margaritis 2023-11-29 11:22:08 +02:00 committed by GitHub
commit a26bed96bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 242 additions and 83 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "simde"]
path = simde
url = https://github.com/simd-everywhere/simde.git

View File

@ -119,15 +119,22 @@ endif()
# Detect OS and if Fat Runtime is available
include (${CMAKE_MODULE_PATH}/osdetection.cmake)
if (ARCH_IA32 OR ARCH_X86_64)
if(SIMDE_BACKEND)
include (${CMAKE_MODULE_PATH}/simde.cmake)
elseif (ARCH_IA32 OR ARCH_X86_64)
include (${CMAKE_MODULE_PATH}/cflags-x86.cmake)
set(ARCH_FLAG march)
elseif (ARCH_ARM32 OR ARCH_AARCH64)
include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
set(ARCH_FLAG march)
elseif (ARCH_PPC64EL)
include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
else ()
message(FATAL_ERROR "Unsupported platform")
endif ()
if (ARCH_PPC64EL)
set(ARCH_FLAG mcpu)
else ()
set(ARCH_FLAG march)
endif ()
# Detect Native arch flags if requested
@ -239,8 +246,11 @@ set (hs_exec_common_SRCS
src/util/arch/common/cpuid_flags.h
src/util/multibit.c
)
if (ARCH_IA32 OR ARCH_X86_64)
if (SIMDE_BACKEND)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/simde/cpuid_flags.c)
elseif (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_common_SRCS
${hs_exec_common_SRCS}
src/util/arch/x86/cpuid_flags.c
@ -398,7 +408,12 @@ set (hs_exec_SRCS
src/database.h
)
if (ARCH_IA32 OR ARCH_X86_64)
if (SIMDE_BACKEND)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/nfa/vermicelli_simd.cpp
src/util/supervector/arch/x86/impl.cpp)
elseif (ARCH_IA32 OR ARCH_X86_64)
set (hs_exec_SRCS
${hs_exec_SRCS}
src/nfa/vermicelli_simd.cpp

View File

@ -1,8 +1,12 @@
# About Vectorscan
A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
is 100% functional, and Power VSX are in development. ARM SVE2 support is in ongoing with
and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
access to hardware now. More platforms will follow in the future.
Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
port, which can either be used for platforms without official SIMD support,
as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
for reference and comparison purposes.
Vectorscan will follow Intel's API and internal algorithms where possible, but will not
hesitate to make code changes where they are thought to give better performance or better
@ -148,6 +152,11 @@ Common options for Cmake are:
* `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.
## SIMDe options
* `SIMDE_BACKEND=[On|Off]` Enable the SIMDe backend. If this is chosen, all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled instead. This enables Vectorscan to build and run on architectures without SIMD.
* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the build platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, VSX on a Power platform, etc.
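As an illustration of what the SSE4.2 emulation backend provides, here is a minimal standalone C sketch, not part of this PR: it calls SIMDe's explicit `simde_mm_*` forms of the SSE intrinsics, which compile to native SSE on x86 and to portable (or Neon/VSX) code everywhere else. The helper name, buffer contents and compile command are assumptions made for the example.

```c
/* Hypothetical example: find a byte in a 16-byte block using SIMDe.
 * Builds on any architecture, e.g.:  cc -I<path-to-simde> example.c  */
#include <simde/x86/sse4.2.h>
#include <stdio.h>

/* Return the offset of the first occurrence of 'c' in the 16-byte block 'p',
 * or -1 if it is not present. */
static int find_byte_in_block(const unsigned char *p, unsigned char c) {
    simde__m128i data  = simde_mm_loadu_si128((const simde__m128i *)p);
    simde__m128i match = simde_mm_cmpeq_epi8(data, simde_mm_set1_epi8((char)c));
    int mask = simde_mm_movemask_epi8(match);  /* one bit per byte lane */
    if (mask == 0) {
        return -1;
    }
    return __builtin_ctz(mask);                /* GCC/Clang builtin: first set bit */
}

int main(void) {
    const unsigned char buf[16] = "vectorscan";
    printf("'s' found at offset %d\n", find_byte_in_block(buf, 's'));
    return 0;
}
```

With `SIMDE_NATIVE=On`, SIMDe is allowed to implement such calls with the host's own SIMD instructions; with it off, the portable scalar fallbacks are used, which is what makes the backend viable on architectures without any SIMD at all.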
## Build
If `cmake` has completed successfully you can run `make` in the same directory. If you have a multi-core system with `N` cores, running
@ -177,4 +186,4 @@ the [Developer Reference Guide](http://intel.github.io/hyperscan/dev-reference/)
And you can find the source code [on Github](https://github.com/intel/hyperscan).
For Intel Hyperscan related issues and questions, please follow the relevant links there.

View File

@ -1,4 +1,7 @@
if (NOT FAT_RUNTIME AND (BUILD_STATIC_AND_SHARED OR BUILD_STATIC_LIBS))
include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
add_executable(benchmarks benchmarks.cpp)
set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
"-Wall -Wno-unused-variable")

View File

@ -67,7 +67,10 @@ if (USE_CPU_NATIVE)
message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
endif()
else()
if (ARCH_IA32 OR ARCH_X86_64)
if (SIMDE_BACKEND)
set(GNUCC_ARCH native)
set(TUNE_FLAG native)
elseif (ARCH_IA32 OR ARCH_X86_64)
set(GNUCC_ARCH native)
set(TUNE_FLAG generic)
elseif(ARCH_AARCH64)
@ -84,8 +87,11 @@ else()
elseif(ARCH_ARM32)
set(GNUCC_ARCH armv7a)
set(TUNE_FLAG generic)
elseif(ARCH_PPC64EL)
set(GNUCC_ARCH power8)
set(TUNE_FLAG power8)
else()
set(GNUCC_ARCH power9)
set(TUNE_FLAG power9)
set(GNUCC_ARCH native)
set(TUNE_FLAG native)
endif()
endif()

15
cmake/simde.cmake Normal file
View File

@ -0,0 +1,15 @@
include_directories(${PROJECT_SOURCE_DIR}/simde/simde)
CHECK_INCLUDE_FILES("simde/x86/sse4.2.h" SIMDE_SSE42_H_FOUND)
if (SIMDE_SSE42_H_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
if (SIMDE_NATIVE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
endif()
else()
message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
endif()

1
simde Submodule

@ -0,0 +1 @@
Subproject commit aae22459fa284e9fc2b7d4b8e4571afa0418125f

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2016-2017, Intel Corporation
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -29,28 +30,30 @@
#include "config.h"
#include "hs_common.h"
#include "ue2common.h"
#if !defined(VS_SIMDE_BACKEND)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/cpuid_inline.h"
#elif defined(ARCH_AARCH64)
#include "util/arch/arm/cpuid_inline.h"
#endif
#endif
HS_PUBLIC_API
hs_error_t HS_CDECL hs_valid_platform(void) {
/* Hyperscan requires SSSE3, anything else is a bonus */
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64))
if (check_ssse3()) {
return HS_SUCCESS;
} else {
return HS_ARCH_ERROR;
}
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
if (check_neon()) {
return HS_SUCCESS;
} else {
return HS_ARCH_ERROR;
}
#elif defined(ARCH_PPC64EL)
return HS_SUCCESS;
#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
return HS_SUCCESS;
#endif
}
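Since the SIMDe backend has no hardware feature to probe, `hs_valid_platform()` now simply returns `HS_SUCCESS` in that configuration, while native x86 and Arm builds keep their SSSE3/Neon checks. A minimal, hypothetical caller-side sketch follows; the include path and messages are assumptions, not part of this PR.

```c
#include <stdio.h>
#include <hs.h>   /* Vectorscan/Hyperscan public API */

int main(void) {
    /* With the SIMDe backend this always succeeds; native builds verify
     * SSSE3 (x86) or Neon (Arm) before any database work is attempted. */
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "CPU lacks the SIMD features required by this build\n");
        return 1;
    }
    printf("platform OK\n");
    return 0;
}
```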

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
@ -52,13 +52,17 @@ template <uint16_t S>
static really_inline
SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars);
#if defined(VS_SIMDE_BACKEND)
#include "x86/shufti.hpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "x86/shufti.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#elif (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
#include "arm/shufti.hpp"
#elif defined(ARCH_PPC64EL)
#include "ppc64el/shufti.hpp"
#endif
#endif
template <uint16_t S>
static really_inline

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -45,6 +45,9 @@ template <uint16_t S>
static really_inline
const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars);
#if defined(VS_SIMDE_BACKEND)
#include "x86/truffle.hpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "x86/truffle.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -52,6 +55,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, Supe
#elif defined(ARCH_PPC64EL)
#include "ppc64el/truffle.hpp"
#endif
#endif
template <uint16_t S>
static really_inline

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
* Copyright (c) 2021, Arm Limited
*
* Redistribution and use in source and binary forms, with or without
@ -71,6 +71,9 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S>
SuperVector<S> const mask1, SuperVector<S> const mask2,
u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len);
#if defined(VS_SIMDE_BACKEND)
#include "x86/vermicelli.hpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "x86/vermicelli.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -78,6 +81,7 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector<S> const data, SuperVector<S>
#elif defined(ARCH_PPC64EL)
#include "ppc64el/vermicelli.hpp"
#endif
#endif
template <uint16_t S>
static const u8 *vermicelliExecReal(SuperVector<S> const chars, SuperVector<S> const casemask, u8 const *buf, u8 const *buf_end) {

View File

@ -214,16 +214,22 @@ u64a compress64_impl_c(u64a x, u64a m) {
}
static really_inline
m128 compress128_impl_c(m128 xvec, m128 mvec) {
u64a ALIGN_ATTR(16) x[2];
u64a ALIGN_ATTR(16) m[2];
store128(x, xvec);
store128(m, mvec);
m128 compress128_impl_c(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bitset = one;
m128 vres = zeroes128();
while (isnonzero128(m)) {
m128 mm = sub_2x64(zeroes128(), m);
m128 tv = and128(x, m);
tv = and128(tv, mm);
compress64_impl_c(x[0], m[0]);
compress64_impl_c(x[1], m[1]);
return xvec;
m128 mask = not128(eq64_m128(tv, zeroes128()));
mask = and128(bitset, mask);
vres = or128(vres, mask);
m = and128(m, sub_2x64(m, one));
bitset = lshift64_m128(bitset, 1);
}
return vres;
}
static really_inline
@ -303,16 +309,20 @@ u64a expand64_impl_c(u64a x, u64a m) {
}
static really_inline
m128 expand128_impl_c(m128 xvec, m128 mvec) {
u64a ALIGN_ATTR(16) x[2];
u64a ALIGN_ATTR(16) m[2];
store128(x, xvec);
store128(m, mvec);
expand64_impl_c(x[0], m[0]);
expand64_impl_c(x[1], m[1]);
return xvec;
m128 expand128_impl_c(m128 x, m128 m) {
m128 one = set1_2x64(1);
m128 bb = one;
m128 res = zeroes128();
while (isnonzero128(m)) {
m128 xm = and128(x, bb);
m128 mm = sub_2x64(zeroes128(), m);
m128 mask = not128(eq64_m128(xm, zeroes128()));
mask = and128(mask, and128(m,mm));
res = or128(res, mask);
m = and128(m, sub_2x64(m, one));
bb = lshift64_m128(bb, 1);
}
return res;
}
/* returns the first set bit after begin (if not ~0U). If no bit is set after
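The new `compress128_impl_c`/`expand128_impl_c` bodies above replace the earlier helpers, which discarded the results of the per-half `compress64_impl_c`/`expand64_impl_c` calls, with a bit-serial loop over the mask, i.e. the classic software fallback for PEXT/PDEP. Below is a minimal standalone sketch of the same algorithm on 64-bit scalars; the function names and test values are invented for illustration and are not code from the tree.

```c
/* Bit-serial compress (PEXT-like) and expand (PDEP-like) on 64-bit scalars.
 * The new compress128/expand128 loops apply the same idea lane-wise on m128. */
#include <stdint.h>
#include <stdio.h>

static uint64_t compress64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m != 0; m &= m - 1, bb <<= 1) {
        if (x & m & -m) {   /* is x set at the lowest set bit of m? */
            res |= bb;      /* ...then emit the next packed bit */
        }
    }
    return res;
}

static uint64_t expand64_ref(uint64_t x, uint64_t m) {
    uint64_t res = 0;
    for (uint64_t bb = 1; m != 0; m &= m - 1, bb <<= 1) {
        if (x & bb) {       /* take the packed bits of x in order... */
            res |= m & -m;  /* ...and scatter them to the set bits of m */
        }
    }
    return res;
}

int main(void) {
    uint64_t m = 0xF0F0;                        /* gather/scatter mask */
    uint64_t packed = compress64_ref(0xABCD, m);
    printf("compress: %#llx\n", (unsigned long long)packed);                  /* 0xac */
    printf("expand:   %#llx\n", (unsigned long long)expand64_ref(packed, m)); /* 0xa0c0 */
    return 0;
}
```

Each iteration peels off the lowest set bit of the mask with `m &= m - 1` while `bb` walks the packed-bit positions; the 128-bit versions above do the same thing per 64-bit lane using `sub_2x64`, `eq64_m128`, `not128` and `lshift64_m128`.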

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -41,7 +41,7 @@
#include <string.h> // for memcpy
#if !defined(HAVE_SIMD_128_BITS)
#if !defined(HAVE_SIMD_128_BITS) && !defined(VS_SIMDE_BACKEND)
#error "You need at least a 128-bit capable SIMD engine!"
#endif // HAVE_SIMD_128_BITS
@ -88,7 +88,7 @@ static inline void print_m128_2x64(const char *label, m128 vec) {
#define print_m128_2x64(label, vec) ;
#endif
#if !defined(ARCH_IA32) && !defined(ARCH_X86_64)
#if !defined(ARCH_IA32) && !defined(ARCH_X86_64) && !defined(VS_SIMDE_BACKEND)
#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0
#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8
@ -455,7 +455,6 @@ static really_inline int isnonzero384(m384 a) {
return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}
#if defined(HAVE_SIMD_128_BITS) && !defined(ARCH_IA32) && !defined(ARCH_X86_64)
/**
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
* mask indicating which 32-bit words contain differences.
@ -464,7 +463,6 @@ static really_inline
u32 diffrich384(m384 a, m384 b) {
return diffrich128(a.lo, b.lo) | (diffrich128(a.mid, b.mid) << 4) | (diffrich128(a.hi, b.hi) << 8);
}
#endif
/**
* "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "util/arch/common/cpuid_flags.h"
#include "ue2common.h"
#include "hs_compile.h" // for HS_MODE_ flags
#include "util/arch.h"
u64a cpuid_flags(void) {
return 0;
}
u32 cpuid_tune(void) {
return HS_TUNE_FAMILY_GENERIC;
}

View File

@ -112,6 +112,16 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) {
#endif
}
static really_really_inline
m128 add_2x64(m128 a, m128 b) {
return (m128) _mm_add_epi64(a, b);
}
static really_really_inline
m128 sub_2x64(m128 a, m128 b) {
return (m128) _mm_sub_epi64(a, b);
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
@ -124,8 +134,9 @@ m128 lshift64_m128(m128 a, unsigned b) {
}
#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define eq64_m128(a, b) _mm_cmpeq_epi64((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#if defined(HAVE_AVX512)
static really_inline m128 cast512to128(const m512 in) {
@ -169,7 +180,7 @@ m128 load_m128_from_u64a(const u64a *p) {
static really_inline
m128 rshiftbyte_m128(const m128 a, int count_immed) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(count_immed)) {
return _mm_srli_si128(a, count_immed);
}
@ -200,7 +211,7 @@ m128 rshiftbyte_m128(const m128 a, int count_immed) {
static really_inline
m128 lshiftbyte_m128(const m128 a, int count_immed) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(count_immed)) {
return _mm_slli_si128(a, count_immed);
}
@ -668,24 +679,6 @@ m256 combine2x128(m128 hi, m128 lo) {
}
#endif //AVX2
#if defined(HAVE_SIMD_128_BITS)
/**
* "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
* mask indicating which 32-bit words contain differences.
*/
static really_inline u32 diffrich384(m384 a, m384 b) {
m128 z = zeroes128();
a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
_mm_packs_epi32(a.hi, z));
return ~(_mm_movemask_epi8(packed)) & 0xfff;
}
#endif // HAVE_SIMD_128_BITS
/****
**** 512-bit Primitives
****/

View File

@ -138,8 +138,8 @@ public:
/// Flip all bits.
void flip() {
for (auto &e : bits) {
e = ~e;
for (size_t i = 0; i < size(); i++) {
flip(i);
}
clear_trailer();
}

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -44,7 +45,7 @@
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
#if !defined(VS_SIMDE_BACKEND)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/bitutils.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -52,6 +53,32 @@
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/bitutils.h"
#endif
#else
#include "util/arch/common/bitutils.h"
#define clz32_impl clz32_impl_c
#define clz64_impl clz64_impl_c
#define ctz32_impl ctz32_impl_c
#define ctz64_impl ctz64_impl_c
#define lg2_impl lg2_impl_c
#define lg2_64_impl lg2_64_impl_c
#define findAndClearLSB_32_impl findAndClearLSB_32_impl_c
#define findAndClearLSB_64_impl findAndClearLSB_64_impl_c
#define findAndClearMSB_32_impl findAndClearMSB_32_impl_c
#define findAndClearMSB_64_impl findAndClearMSB_64_impl_c
#define compress32_impl compress32_impl_c
#define compress64_impl compress64_impl_c
#define compress128_impl compress128_impl_c
#define expand32_impl expand32_impl_c
#define expand64_impl expand64_impl_c
#define expand128_impl expand128_impl_c
#define bf64_iterate_impl bf64_iterate_impl_c
#define bf64_set_impl bf64_set_impl_c
#define bf64_unset_impl bf64_unset_impl_c
#define rank_in_mask32_impl rank_in_mask32_impl_c
#define rank_in_mask64_impl rank_in_mask64_impl_c
#define pext32_impl pext32_impl_c
#define pext64_impl pext64_impl_c
#endif
static really_inline
u32 clz32(u32 x) {

View File

@ -74,8 +74,6 @@
# endif
#elif defined(USE_PPC64EL_ALTIVEC_H)
#include <altivec.h>
#else
#error no intrinsics file
#endif
#endif // INTRINSICS_H

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2020-2021, VectorCamp PC
* Copyright (c) 2020-2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -49,6 +49,9 @@ const u8 *first_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 const l
template <u16 S>
const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S);
#if defined(VS_SIMDE_BACKEND)
#include "util/arch/x86/match.hpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/match.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -56,5 +59,6 @@ const u8 *last_zero_match_inverted(const u8 *buf, SuperVector<S> v, u16 len = S)
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/match.hpp"
#endif
#endif
#endif // MATCH_HPP

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -34,7 +35,16 @@
#include "util/intrinsics.h"
#include "ue2common.h"
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if defined(VS_SIMDE_BACKEND)
#define VECTORSIZE 16
#define SIMDE_ENABLE_NATIVE_ALIASES
#if !defined(VS_SIMDE_NATIVE)
#define SIMDE_NO_NATIVE
#endif
#include <simde/x86/sse4.2.h>
typedef simde__m128i m128;
#define HAVE_SIMD_128_BITS
#elif defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/simd_types.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#include "util/arch/arm/simd_types.h"
@ -42,9 +52,6 @@
#include "util/arch/ppc64el/simd_types.h"
#endif
#if !defined(m128) && !defined(HAVE_SIMD_128_BITS)
typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
#endif
#if !defined(m256) && !defined(HAVE_SIMD_256_BITS)
typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
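This hunk is the core of the backend wiring: defining `SIMDE_ENABLE_NATIVE_ALIASES` before including `<simde/x86/sse4.2.h>` and typedef'ing `simde__m128i` as `m128` lets the existing x86 `_mm_*` code paths be reused unchanged on any host. Below is a hypothetical standalone sketch of that mechanism; the helper name, lane values and the commented-out define are assumptions for the example, not project code.

```c
/* With SIMDE_ENABLE_NATIVE_ALIASES defined before the include, unmodified
 * x86-intrinsic code compiles against SIMDe's portable types, even on
 * non-x86 hosts. Defining SIMDE_NO_NATIVE additionally forces the portable
 * implementations (mirroring the !defined(VS_SIMDE_NATIVE) case above). */
#define SIMDE_ENABLE_NATIVE_ALIASES
/* #define SIMDE_NO_NATIVE */
#include <simde/x86/sse4.2.h>

typedef simde__m128i m128;          /* mirrors the typedef in this hunk */

/* Unchanged "x86-style" code: add four 32-bit lanes. */
static m128 add4x32(m128 a, m128 b) {
    return _mm_add_epi32(a, b);     /* resolves to simde_mm_add_epi32 off-x86 */
}

int main(void) {
    m128 a = _mm_set_epi32(3, 2, 1, 0);
    m128 b = _mm_set1_epi32(10);
    m128 r = add4x32(a, b);
    return _mm_extract_epi32(r, 0) == 10 ? 0 : 1;  /* lane 0: 0 + 10 */
}
```

On an x86 host the same source resolves to the real intrinsics, which is what allows the SIMDe backend to also serve as a reference/comparison backend on architectures that already have native support.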

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2020, Intel Corporation
* Copyright (c) 2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -61,6 +62,9 @@ extern const char vbs_mask_data[];
}
#endif
#if defined(VS_SIMDE_BACKEND)
#include "util/arch/x86/simd_utils.h"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/arch/x86/simd_utils.h"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -68,6 +72,7 @@ extern const char vbs_mask_data[];
#elif defined(ARCH_PPC64EL)
#include "util/arch/ppc64el/simd_utils.h"
#endif
#endif
#include "util/arch/common/simd_utils.h"

View File

@ -376,7 +376,7 @@ really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
template <>
really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
{
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(N)) {
return {_mm_slli_si128(u.v128[0], N)};
}
@ -451,7 +451,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
template <>
really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
{
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(N)) {
return {_mm_srli_si128(u.v128[0], N)};
}
@ -472,7 +472,7 @@ really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
template <>
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
{
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(N)) {
return {_mm_srli_si128(u.v128[0], N)};
}
@ -483,7 +483,7 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
template <>
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
{
#if defined(HAVE__BUILTIN_CONSTANT_P)
#if defined(HAVE__BUILTIN_CONSTANT_P) && !defined(VS_SIMDE_BACKEND)
if (__builtin_constant_p(N)) {
return {_mm_slli_si128(u.v128[0], N)};
}

View File

@ -34,6 +34,9 @@
#include <cstdio>
#include <type_traits>
#if defined(VS_SIMDE_BACKEND)
#include "util/supervector/arch/x86/types.hpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/supervector/arch/x86/types.hpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -41,6 +44,7 @@
#elif defined(ARCH_PPC64EL)
#include "util/supervector/arch/ppc64el/types.hpp"
#endif
#endif // VS_SIMDE_BACKEND
#if defined(HAVE_SIMD_512_BITS)
using Z_TYPE = u64a;
@ -57,7 +61,7 @@ using Z_TYPE = u32;
#define DOUBLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#define SINGLE_LOAD_MASK(l) (((1ULL) << (l)) - 1ULL)
#elif defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64)
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
using Z_TYPE = u64a;
#define Z_BITS 64
#define Z_POSSHIFT 2
@ -175,7 +179,7 @@ public:
typename BaseVector<32>::type ALIGN_ATTR(BaseVector<32>::size) v256[SIZE / BaseVector<32>::size];
typename BaseVector<64>::type ALIGN_ATTR(BaseVector<64>::size) v512[SIZE / BaseVector<64>::size];
#if defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL)
#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64) || defined(ARCH_PPC64EL))
uint64x2_t ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size];
int64x2_t ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size];
uint32x4_t ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size];
@ -382,6 +386,9 @@ struct Unroller<End, End>
};
#if defined(HS_OPTIMIZE)
#if defined(VS_SIMDE_BACKEND)
#include "util/supervector/arch/x86/impl.cpp"
#else
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#include "util/supervector/arch/x86/impl.cpp"
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
@ -390,6 +397,7 @@ struct Unroller<End, End>
#include "util/supervector/arch/ppc64el/impl.cpp"
#endif
#endif
#endif
#endif /* SUPERVECTOR_H */

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2023, VectorCamp PC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -667,7 +668,7 @@ TEST(SimdUtilsTest, movq) {
ASSERT_EQ(0, memcmp(cmp, &r, sizeof(r)));
#if defined(HAVE_SIMD_128_BITS)
#if defined(ARCH_IA32) || defined(ARCH_X86_64)
#if defined(ARCH_IA32) || defined(ARCH_X86_64) || defined(VS_SIMDE_BACKEND)
simd = _mm_set_epi64x(~0LL, 0x123456789abcdef);
#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
int64x2_t a = { 0x123456789abcdefLL, ~0LL };