Release 5.4.12 (#341)

Multiple changes since last release, this will be the last 100% ABI and API compatible with Hyperscan release. Next versions will include major refactors and API extensions, it will be mostly backwards compatible however. Without particular order, platform support is now: * Linux (x86, Arm, Power) * FreeBSD 14 (x86, Arm, Power) * MacOS 14+ (x86, Arm) In total more than 200 configurations in the CI are tested for every PR. Other features: - Fat Runtime supported for Arm as well (ASIMD/SVE/SVE2). - Initial implementations for Arm SVE/SVE2 algorithms added, thanks to Yoan Picchi from Arm. - SIMDe support added, used as an alternative backend for existing platforms, but mostly interesting for allowing Vectorscan to build in new platforms without a supported SIMD engine. - Various speedups and optimizations. - Cppcheck and clang-tidy fixes throughout the code, both have been added to CI for multiple configurations, but only cppcheck triggers a build failure for now. Various bugfixes, most important listed: - Speed up truffle with 256b TBL instructions (#290) - Fix Clang Tidy warnings (#295) - Clang 17+ is more restrictive on rebind<T> on MacOS/Boost, remove warning (#332) - partial_load_u64 will fail if buf == NULL/c_len == 0 (#331) - Bugfix/fix avx512vbmi regressions (#335) - fix missing hs_version.h header (closes #198) - hs_valid_platform: Fix check for SSE4.2 (#310) - Fixed out of bounds read in AVX512VBMI version of fdr_exec_fat_teddy … (#333) - Fix noodle SVE2 off by one bug (#313) - Make vectorscan accept \0 starting pattern (#312) - Fix 5.4.11's config step regression (#327) - Fix double shufti's vector end false positive (#325)
2025-08-13 13:25:58 +03:00 · 2025-07-22 18:09:14 +03:00 · 2025-07-22 18:09:14 +03:00 · b585ad4666
commit b585ad4666
parent d29730e1cb 22b76d11a7
377 changed files with 1397527 additions and 5731 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -0,0 +1,11 @@
+#unit/gtest/gtest-all.cc,build/src/parser/Parser.cpp,build/src/parser/control_verbs.cpp
+# Don't change the first comment line: it lists specific files that clang-tidy should ignore
+
+
+Checks:              'clang-analyzer-*,-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,performance-*,-performance-unnecessary-value-param,-performance-avoid-endl'
+WarningsAsErrors:    ''
+HeaderFilterRegex:   '.*'
+SystemHeaders: false
+FormatStyle:         none
+InheritParentConfig: true
+User:                user
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "simde"]
+	path = simde
+	url = https://github.com/simd-everywhere/simde.git
--- a/CHANGELOG-vectorscan.md
+++ b/CHANGELOG-vectorscan.md
@ -2,6 +2,39 @@

 This is a list of notable changes to Vectorscan, in reverse chronological order. For Hyperscan Changelog, check CHANGELOG.md

+## [5.4.12] 2025-07-21
+
+Multiple changes since the last release; this will be the last release that is 100% ABI and API compatible with Hyperscan.
+Future versions will include major refactors and API extensions; they will remain mostly backwards compatible, however.
+In no particular order, platform support is now:
+
+* Linux (x86, Arm, Power)
+* FreeBSD 14 (x86, Arm, Power)
+* MacOS 14+ (x86, Arm)
+
+In total more than 200 configurations in the CI are tested for every PR.
+
+Other features:
+- Fat Runtime supported for Arm as well (ASIMD/SVE/SVE2).
+- Initial implementations for Arm SVE/SVE2 algorithms added, thanks to Yoan Picchi from Arm.
+- SIMDe support added, used as an alternative backend for existing platforms, but mostly interesting for allowing Vectorscan to build on new platforms without a supported SIMD engine.
+- Various speedups and optimizations.
+- Cppcheck and clang-tidy fixes throughout the code, both have been added to CI for multiple configurations, but only cppcheck triggers a build failure for now.
+
+Various bugfixes, most important listed:
+- Speed up truffle with 256b TBL instructions (#290)
+- Fix Clang Tidy warnings (#295)
+- Clang 17+ is more restrictive on rebind<T> on MacOS/Boost, remove warning (#332)
+- partial_load_u64 will fail if buf == NULL/c_len == 0 (#331)
+- Bugfix/fix avx512vbmi regressions (#335)
+- fix missing hs_version.h header (closes #198)
+- hs_valid_platform: Fix check for SSE4.2 (#310)
+- Fixed out of bounds read in AVX512VBMI version of fdr_exec_fat_teddy … (#333)
+- Fix noodle SVE2 off by one bug (#313)
+- Make vectorscan accept \0 starting pattern (#312)
+- Fix 5.4.11's config step regression (#327)
+- Fix double shufti's vector end false positive (#325)
+
 ## [5.4.11] 2023-11-19

 - Refactor CMake build system to be much more modular.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -4,7 +4,7 @@ project (vectorscan C CXX)

 set (HS_MAJOR_VERSION 5)
 set (HS_MINOR_VERSION 4)
-set (HS_PATCH_VERSION 11)
+set (HS_PATCH_VERSION 12)
 set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION})

 string (TIMESTAMP BUILD_DATE "%Y-%m-%d")
@ -23,11 +23,10 @@ INCLUDE (CheckLibraryExists)
 INCLUDE (CheckSymbolExists)
 include (CMakeDependentOption)
 include (GNUInstallDirs)
-include (${CMAKE_MODULE_PATH}/platform.cmake)
 include (${CMAKE_MODULE_PATH}/boost.cmake)
 include (${CMAKE_MODULE_PATH}/ragel.cmake)

-find_package(PkgConfig REQUIRED)
+find_package(PkgConfig QUIET)

 find_program(RAGEL ragel)

@ -35,6 +34,13 @@ if(${RAGEL} STREQUAL "RAGEL-NOTFOUND")
    message(FATAL_ERROR "Ragel state machine compiler not found")
 endif()

+# Add ccache to speed builds
+find_program(CCACHE_FOUND ccache)
+if(CCACHE_FOUND)
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif(CCACHE_FOUND)
+
 # Build type check

 if (NOT CMAKE_BUILD_TYPE)
@ -116,18 +122,33 @@ if (RELEASE_BUILD)
    add_definitions(-DNDEBUG)
 endif()

+# Architecture detection
+
+include (${CMAKE_MODULE_PATH}/platform.cmake)
+
 # Detect OS and if Fat Runtime is available
 include (${CMAKE_MODULE_PATH}/osdetection.cmake)

-if (ARCH_IA32 OR ARCH_X86_64)
+if(ARCH_X86_64 AND BUILD_SSE2_SIMDE AND NOT FAT_RUNTIME)
+    set(SIMDE_BACKEND True)
+endif()
+
+if(SIMDE_BACKEND)
+    include (${CMAKE_MODULE_PATH}/simde.cmake)
+elseif (ARCH_IA32 OR ARCH_X86_64)
    include (${CMAKE_MODULE_PATH}/cflags-x86.cmake)
-    set(ARCH_FLAG march)
 elseif (ARCH_ARM32 OR ARCH_AARCH64)
    include (${CMAKE_MODULE_PATH}/cflags-arm.cmake)
-    set(ARCH_FLAG march)
 elseif (ARCH_PPC64EL)
    include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake)
+else ()
+    message(FATAL_ERROR "Unsupported platform")
+endif ()
+
+if (ARCH_PPC64EL)
    set(ARCH_FLAG mcpu)
+else ()
+    set(ARCH_FLAG march)
 endif ()

 # Detect Native arch flags if requested
@ -139,9 +160,11 @@ include (${CMAKE_MODULE_PATH}/sanitize.cmake)

 if (NOT FAT_RUNTIME)
    if (GNUCC_TUNE)
+        message(STATUS "GNUCC_TUNE is set")
        set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
        set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE}")
    else()
+        message(STATUS "GNUCC_TUNE is not set")
        set(ARCH_C_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_C_FLAGS}")
        set(ARCH_CXX_FLAGS "-${ARCH_FLAG}=${GNUCC_ARCH} -mtune=${TUNE_FLAG} ${ARCH_CXX_FLAGS}")
    endif()
@ -207,6 +230,19 @@ set_source_files_properties(

 ragelmaker(src/parser/control_verbs.rl)

+# BSD has the _np funcs in a _np header
+CHECK_INCLUDE_FILE_CXX(pthread_np.h HAVE_PTHREAD_NP_H)
+if (HAVE_PTHREAD_NP_H)
+    set (PTHREAD_NP_INC pthread_np.h)
+else ()
+    set (PTHREAD_NP_INC pthread.h)
+endif ()
+CHECK_CXX_SYMBOL_EXISTS(pthread_setaffinity_np ${PTHREAD_NP_INC} HAVE_DECL_PTHREAD_SETAFFINITY_NP)
+
+CHECK_FUNCTION_EXISTS(malloc_info HAVE_MALLOC_INFO)
+CHECK_FUNCTION_EXISTS(shmget HAVE_SHMGET)
+set(HAVE_SHMGET ${HAVE_SHMGET} CACHE BOOL "shmget()")
+
 # do substitutions
 configure_file(${CMAKE_MODULE_PATH}/config.h.in ${PROJECT_BINARY_DIR}/config.h)
 configure_file(src/hs_version.h.in ${PROJECT_BINARY_DIR}/hs_version.h)
@ -239,8 +275,11 @@ set (hs_exec_common_SRCS
    src/util/arch/common/cpuid_flags.h
    src/util/multibit.c
    )
-
-if (ARCH_IA32 OR ARCH_X86_64)
+if (SIMDE_BACKEND)
+set (hs_exec_common_SRCS
+    ${hs_exec_common_SRCS}
+    src/util/arch/simde/cpuid_flags.c)
+elseif (ARCH_IA32 OR ARCH_X86_64)
 set (hs_exec_common_SRCS
    ${hs_exec_common_SRCS}
    src/util/arch/x86/cpuid_flags.c
@ -275,7 +314,7 @@ set (hs_exec_SRCS
    src/fdr/fdr_confirm_runtime.h
    src/fdr/flood_runtime.h
    src/fdr/fdr_loadval.h
-    src/fdr/teddy.c
+    src/fdr/teddy.cpp
    src/fdr/teddy.h
    src/fdr/teddy_internal.h
    src/fdr/teddy_runtime_common.h
@ -398,7 +437,12 @@ set (hs_exec_SRCS
    src/database.h
 )

-if (ARCH_IA32 OR ARCH_X86_64)
+if (SIMDE_BACKEND)
+set (hs_exec_SRCS
+    ${hs_exec_SRCS}
+    src/nfa/vermicelli_simd.cpp
+    src/util/supervector/arch/x86/impl.cpp)
+elseif (ARCH_IA32 OR ARCH_X86_64)
 set (hs_exec_SRCS
    ${hs_exec_SRCS}
    src/nfa/vermicelli_simd.cpp
@ -414,9 +458,11 @@ set (hs_exec_SRCS
    src/util/supervector/arch/ppc64el/impl.cpp)
 endif()

+
 if (ARCH_IA32 OR ARCH_X86_64)
    set (hs_exec_avx2_SRCS
-        src/fdr/teddy_avx2.c
+        src/fdr/teddy.cpp
+        src/fdr/teddy_fat.cpp
        src/util/arch/x86/masked_move.c
        src/util/arch/x86/masked_move.h
    )
@ -918,16 +964,47 @@ else ()
        if (NOT BUILD_AVX512VBMI)
            set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DDISABLE_AVX512VBMI_DISPATCH")
        endif (NOT BUILD_AVX512VBMI)
+        if(BUILD_SSE2_SIMDE)
+            set (DISPATCHER_DEFINE "${DISPATCHER_DEFINE} -DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
+        endif(BUILD_SSE2_SIMDE)
        set_source_files_properties(src/dispatcher.c PROPERTIES
            COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}")
+        if(BUILD_SSE2_SIMDE AND NOT BUILD_AVX2 AND NOT BUILD_AVX512 AND NOT BUILD_AVX512VBMI)
+            set_source_files_properties(src/crc32.c PROPERTIES
+                COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
+            set_source_files_properties(src/hs.cpp PROPERTIES
+                COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2")
+
+            string(REGEX REPLACE "-msse4.2" "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+            string(REGEX REPLACE "-msse4.2" "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+        endif()

        if (BUILD_STATIC_LIBS)
-            add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
-            list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
-            set_target_properties(hs_exec_core2 PROPERTIES
-                COMPILE_FLAGS "-march=core2 -msse4.2"
-                RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
-                )
+
+            if (BUILD_SSE2_SIMDE)
+              add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
+              list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
+              message("Building SIMDE SSE2 version..")
+              include_directories(${PROJECT_SOURCE_DIR}/simde)
+              if (CMAKE_COMPILER_IS_CLANG)
+                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+              endif()
+
+              set_target_properties(hs_exec_core2 PROPERTIES
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+                  COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
+                  RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
+                  )
+            else()
+              add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS})
+              list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_core2>)
+              set_target_properties(hs_exec_core2 PROPERTIES
+                  COMPILE_FLAGS "-march=core2 -msse4.2"
+                  RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
+                  )
+	    endif(BUILD_SSE2_SIMDE)

            add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS})
            list(APPEND RUNTIME_LIBS $<TARGET_OBJECTS:hs_exec_corei7>)
@ -980,19 +1057,59 @@ else ()
                $<TARGET_OBJECTS:hs_compile>
                $<TARGET_OBJECTS:hs_exec_common>
                ${RUNTIME_LIBS})
+
+            if (BUILD_SSE2_SIMDE)
+                set_target_properties(hs_compile PROPERTIES
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+                  COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
+                )
+                set_target_properties(hs PROPERTIES
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+                  COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
+                )
+            endif (BUILD_SSE2_SIMDE)
+
        endif (BUILD_STATIC_LIBS)

        if (BUILD_SHARED_LIBS)
            # build shared libs
            add_library(hs_compile_shared OBJECT ${hs_compile_SRCS})
            set_target_properties(hs_compile_shared PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
-            add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
-            list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
-            set_target_properties(hs_exec_shared_core2 PROPERTIES
-                COMPILE_FLAGS "-march=core2 -msse4.2"
-                POSITION_INDEPENDENT_CODE TRUE
-                RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
-                )
+
+            if (BUILD_SSE2_SIMDE)
+              message("Building SIMDE SSE2 version..")
+              add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
+              list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
+
+              include_directories(${PROJECT_SOURCE_DIR}/simde)
+              if (CMAKE_COMPILER_IS_CLANG)
+                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+              endif()
+              set_target_properties(hs_exec_shared_core2 PROPERTIES
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+                  COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
+                  POSITION_INDEPENDENT_CODE TRUE
+                  RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
+                  )
+              set_target_properties(hs_compile_shared PROPERTIES
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+                  string(REGEX REPLACE "-msse4.2" "-msse2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+                  COMPILE_FLAGS "-DVS_SIMDE_BACKEND -DVS_SIMDE_NATIVE -march=core2 -msse2"
+                  )
+            else()
+              add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS})
+              list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_core2>)
+              set_target_properties(hs_exec_shared_core2 PROPERTIES
+                  COMPILE_FLAGS "-march=core2 -msse4.2"
+                  POSITION_INDEPENDENT_CODE TRUE
+                  RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in"
+                  )
+            endif (BUILD_SSE2_SIMDE)
+
            add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS})
            list(APPEND RUNTIME_SHLIBS $<TARGET_OBJECTS:hs_exec_shared_corei7>)
            set_target_properties(hs_exec_shared_corei7 PROPERTIES
@ -1194,11 +1311,17 @@ if (NOT BUILD_STATIC_LIBS)
 endif ()

 add_subdirectory(util)
-add_subdirectory(unit)

-if (EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt)
+option(BUILD_UNIT "Build Hyperscan unit tests (default TRUE)" TRUE)
+if(BUILD_UNIT)
+    add_subdirectory(unit)
+endif()
+
+option(BUILD_TOOLS "Build Hyperscan tools (default TRUE)" TRUE)
+if(EXISTS ${CMAKE_SOURCE_DIR}/tools/CMakeLists.txt AND BUILD_TOOLS)
    add_subdirectory(tools)
 endif()
+
 if (EXISTS ${CMAKE_SOURCE_DIR}/chimera/CMakeLists.txt AND BUILD_CHIMERA)
    add_subdirectory(chimera)
 endif()
@ -1213,4 +1336,7 @@ if(BUILD_BENCHMARKS)
    add_subdirectory(benchmarks)
 endif()

-add_subdirectory(doc/dev-reference)
+option(BUILD_DOC "Build the Hyperscan documentation (default TRUE)" TRUE)
+if(BUILD_DOC)
+    add_subdirectory(doc/dev-reference)
+endif()
--- a/Contributors-vectorscan.md
+++ b/Contributors-vectorscan.md
@ -1,4 +1,6 @@
-   394	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
+   549	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
+    78  George Economou <george.economou@vectorcamp.gr>
+    71  George Tsoulkanakis <george.tsoulkanakis@vectorcamp.gr>
    59	apostolos <apostolos.tapsas@vectorcamp.gr>
    25	Hong, Yang A <yang.a.hong@intel.com>
    19	George Wort <george.wort@arm.com>
@ -6,20 +8,29 @@
     7	Danila Kutenin <danilak@google.com>
     7	Wang Xiang W <xiang.w.wang@intel.com>
     6	Alex Bondarev <abondarev84@gmail.com>
-     5	Konstantinos Margaritis <konma@vectorcamp.gr>
+     6  Yoan Picchi <yoan.picchi@arm.com>
+     5  Jeremy Linton <jeremy.linton@arm.com>
     3	Duncan Bellamy <dunk@denkimushi.com>
     2	Azat Khuzhin <a3at.mail@gmail.com>
     2	Jan Henning <jan.thilo.henning@sap.com>
     1	BigRedEye <mail@bigredeye.me>
+     1  Brad Larsen <bradford.larsen@praetorian.com>
+     1  Chrysovalantis - Michail Liakopoulos <valadis.liakopoulos@vectorcamp.gr>
     1	Daniel Kutenin <kutdanila@yandex.ru>
     1	Danila Kutenin <kutdanila@yandex.ru>
+     1  HelixHexagon <60048780+HelixHexagon@users.noreply.github.com>
+     1  Jingbo Chen <cj@yanhuangdata.com>
     1	Liu Zixian <hdu_sdlzx@163.com>
+     1  Matthias Gliwka <matthias@gliwka.eu>
+     1  Michael Tremer <michael.tremer@ipfire.org>
     1	Mitchell Wasson <miwasson@cisco.com>
     1	Piotr Skamruk <piotr.skamruk@gmail.com>
+     1  Rafał Dowgird <dowgird@gmail.com>
     1	Robbie Williamson <robbie.williamson@arm.com>
     1	Robert Schulze <robert@clickhouse.com>
     1	Walt Stoneburner <wls@wwco.com>
     1	Zhu,Wenjun <wenjun.zhu@intel.com>
     1	hongyang7 <yang.a.hong@intel.com>
+     1  ibrkas01arm <ibrahim.kashif@arm.com>
     1	jplaisance <jeffplaisance@gmail.com>
     1	liquidaty <info@liquidaty.com>
--- a/README.md
+++ b/README.md
@ -1,8 +1,12 @@
 # About Vectorscan

 A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD
-is 100% functional, and Power VSX are in development. ARM SVE2 support is in ongoing with
+and Power VSX are 100% functional. ARM SVE2 support is ongoing, with
 access to hardware now. More platforms will follow in the future.
+Furthermore, starting with 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde)
+port, which can be used either for platforms without official SIMD support,
+as SIMDe can emulate SIMD instructions, or as an alternative backend for existing architectures,
+for reference and comparison purposes.

 Vectorscan will follow Intel's API and internal algorithms where possible, but will not
 hesitate to make code changes where it is thought of giving better performance or better
@ -94,7 +98,7 @@ some small but necessary changes were made that might break compatibility with h
 In order to build on Debian/Ubuntu make sure you install the following build-dependencies

 ```
-$ sudo apt build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
+$ sudo apt install build-essential cmake ragel pkg-config libsqlite3-dev libpcap-dev
 ```

 ### Other distributions
@ -109,6 +113,69 @@ Assuming an existing HomeBrew installation:
 % brew install boost cmake gcc libpcap pkg-config ragel sqlite
 ```

+### *BSD
+In NetBSD you will almost certainly need to have a newer compiler installed. 
+Also you will need to install cmake, sqlite, boost and ragel. 
+Also, libpcap is necessary for some of the benchmarks, so let's install that 
+as well.
+When using pkgsrc, you would typically do this using something
+similar to
+```
+pkg_add gcc12-12.3.0.tgz
+pkg_add boost-headers-1.83.0.tgz  boost-jam-1.83.0.tgz      boost-libs-1.83.0nb1.tgz
+pkg_add ragel-6.10.tgz
+pkg_add cmake-3.28.1.tgz
+pkg_add sqlite3-3.44.2.tgz
+pkg_add libpcap-1.10.4.tgz
+```
+Version numbers etc will of course vary. One would either download the
+binary packages or build them using pkgsrc. There exist some NetBSD pkg 
+tools like ```pkgin``` which help download e.g. dependencies as binary packages,
+but overall NetBSD leaves a lot of detail exposed to the user.
+The main package system used in NetBSD is pkgsrc and one will probably
+want to read up more about it than is in the scope of this document.
+See https://www.netbsd.org/docs/software/packages.html for more information.
+
+This will not replace the compiler in the standard base distribution, and
+cmake will probably find the base dist's compiler when it checks automatically.
+Using the example of gcc12 from pkgsrc, one will need to set two
+environment variables before starting: 
+```
+export CC="/usr/pkg/gcc12/bin/cc"
+export CXX="/usr/pkg/gcc12/bin/g++"
+```
+
+In FreeBSD similarly, you might want to install a different compiler.
+If you want to use gcc, it is recommended to use gcc12.
+You will also, as in NetBSD, need to install cmake, sqlite, boost and ragel packages.
+Using the example of gcc12 from pkg, install the desired compiler
+and the other dependencies:
+```
+pkg install gcc12
+pkg install boost-all
+pkg install ragel
+pkg install cmake
+pkg install sqlite
+pkg install libpcap
+pkg install ccache
+```
+and then before beginning the cmake and build process, set
+the environment variables to point to this compiler: 
+```
+export CC="/usr/local/bin/gcc"
+export CXX="/usr/local/bin/g++"
+```
+A further note on FreeBSD: on the PowerPC and ARM platforms,
+the gcc12 package installs under a slightly different name. On FreeBSD/ppc,
+gcc12 will be found using:
+```
+export CC="/usr/local/bin/gcc12"
+export CXX="/usr/local/bin/g++12"
+```
+
+Then continue with the build as below. 
+
+
 ## Configure & build

 In order to configure with `cmake` first create and cd into a build directory:
@ -148,6 +215,11 @@ Common options for Cmake are:

 * `SANITIZE=[address|memory|undefined]` (experimental) Use `libasan` sanitizer to detect possible bugs. For now only `address` is tested. This will eventually be integrated in the CI.

+## SIMDe options
+
+* `SIMDE_BACKEND=[On|Off]` Enable SIMDe backend. If this is chosen all native (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be disabled and a SIMDe SSE4.2 emulation backend will be enabled. This will enable Vectorscan to build and run on architectures without SIMD.
+* `SIMDE_NATIVE=[On|Off]` Enable SIMDe native emulation of x86 SSE4.2 intrinsics on the building platform. That is, SSE4.2 intrinsics will be emulated using Neon on an Arm platform, or VSX on a Power platform, etc.
+
 ## Build

 If `cmake` has completed successfully you can run `make` in the same directory, if you have a multi-core system with `N` cores, running
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@ -1,4 +1,7 @@
-if (NOT FAT_RUNTIME AND (BUILD_STATIC_AND_SHARED OR BUILD_STATIC_LIBS))
+include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR})
+
+if (NOT FAT_RUNTIME AND (BUILD_SHARED_LIBS OR BUILD_STATIC_LIBS))
  add_executable(benchmarks benchmarks.cpp)
  set_source_files_properties(benchmarks.cpp PROPERTIES COMPILE_FLAGS
      "-Wall -Wno-unused-variable")
--- a/benchmarks/benchmarks.cpp
+++ b/benchmarks/benchmarks.cpp
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2020, 2021, VectorCamp PC
+ * Copyright (c) 2023, 2024, Arm Limited
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,32 +27,31 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

-#include <iostream>
 #include <chrono>
+#include <cstdlib>
 #include <cstring>
 #include <ctime>
-#include <cstdlib>
-#include <memory>
 #include <functional>
+#include <iostream>
+#include <memory>

+#include "util/arch.h"
 #include "benchmarks.hpp"

-#define MAX_LOOPS    1000000000
-#define MAX_MATCHES  5
-#define N            8
+#define MAX_LOOPS 1000000000
+#define MAX_MATCHES 5
+#define N 8

 struct hlmMatchEntry {
    size_t to;
    u32 id;
-    hlmMatchEntry(size_t end, u32 identifier) :
-            to(end), id(identifier) {}
+    hlmMatchEntry(size_t end, u32 identifier) : to(end), id(identifier) {}
 };

 std::vector<hlmMatchEntry> ctxt;

-static
-hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
-                              UNUSED struct hs_scratch *scratch) {
+static hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
+                                     UNUSED struct hs_scratch *scratch) { // cppcheck-suppress constParameterCallback
    DEBUG_PRINTF("match @%zu = %u\n", to, id);

    ctxt.push_back(hlmMatchEntry(to, id));
@ -59,40 +59,42 @@ hwlmcb_rv_t hlmSimpleCallback(size_t to, u32 id,
    return HWLM_CONTINUE_MATCHING;
 }

-template<typename InitFunc, typename BenchFunc>
-static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse, MicroBenchmark &bench, InitFunc &&init, BenchFunc &&func) {
+template <typename InitFunc, typename BenchFunc>
+static void run_benchmarks(int size, int loops, int max_matches,
+                           bool is_reverse, MicroBenchmark &bench,
+                           InitFunc &&init, BenchFunc &&func) {
    init(bench);
    double total_sec = 0.0;
-    u64a total_size = 0;
-    double bw = 0.0;
-    double avg_bw = 0.0;
    double max_bw = 0.0;
    double avg_time = 0.0;
    if (max_matches) {
+        double avg_bw = 0.0;
        int pos = 0;
-        for(int j = 0; j < max_matches - 1; j++) {
+        for (int j = 0; j < max_matches - 1; j++) {
            bench.buf[pos] = 'b';
-            pos = (j+1) *size / max_matches ;
+            pos = (j + 1) * size / max_matches;
            bench.buf[pos] = 'a';
            u64a actual_size = 0;
            auto start = std::chrono::steady_clock::now();
-            for(int i = 0; i < loops; i++) { 
+            for (int i = 0; i < loops; i++) {
                const u8 *res = func(bench);
-		if (is_reverse)
-		   actual_size += bench.buf.data() + size - res;
-		else
-                   actual_size += res - bench.buf.data();
+                if (is_reverse)
+                    actual_size += bench.buf.data() + size - res;
+                else
+                    actual_size += res - bench.buf.data();
            }
            auto end = std::chrono::steady_clock::now();
-            double dt = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+            double dt = std::chrono::duration_cast<std::chrono::microseconds>(
+                            end - start)
+                            .count();
            total_sec += dt;
            /*convert microseconds to seconds*/
            /*calculate bandwidth*/
-            bw  = (actual_size / dt) * 1000000.0 / 1048576.0;
-	    /*std::cout << "act_size = " << act_size << std::endl;
-	    std::cout << "dt = " << dt << std::endl;
-	    std::cout << "bw = " << bw << std::endl;*/
-	    avg_bw += bw;
+            double bw = (actual_size / dt) * 1000000.0 / 1048576.0;
+            /*std::cout << "act_size = " << act_size << std::endl;
+            std::cout << "dt = " << dt << std::endl;
+            std::cout << "bw = " << bw << std::endl;*/
+            avg_bw += bw;
            /*convert to MB/s*/
            max_bw = std::max(bw, max_bw);
            /*calculate average time*/
@ -100,20 +102,22 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
        }
        avg_time /= max_matches;
        avg_bw /= max_matches;
-	total_sec /= 1000000.0;
+        total_sec /= 1000000.0;
        /*convert average time to us*/
-        printf(KMAG "%s: %u matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
-               KBLU "average time per call =" RST " %.3f μs," KBLU " max bandwidth = " RST " %.3f MB/s," KBLU " average bandwidth =" RST " %.3f MB/s \n",
+        printf("%-18s, %-12d, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7.3f\n",
               bench.label, max_matches, size ,loops, total_sec, avg_time, max_bw, avg_bw);
    } else {
+        u64a total_size = 0;
        auto start = std::chrono::steady_clock::now();
        for (int i = 0; i < loops; i++) {
-            const u8 *res = func(bench);
+            func(bench);
        }
        auto end = std::chrono::steady_clock::now();
-        total_sec += std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+        total_sec +=
+            std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+                .count();
        /*calculate transferred size*/
-        total_size = size * loops;
+        total_size = (u64a)size * (u64a)loops;
        /*calculate average time*/
        avg_time = total_sec / loops;
        /*convert microseconds to seconds*/
@ -122,130 +126,182 @@ static void run_benchmarks(int size, int loops, int max_matches, bool is_reverse
        max_bw = total_size / total_sec;
        /*convert to MB/s*/
        max_bw /= 1048576.0;
-        printf(KMAG "%s: no matches, %u * %u iterations," KBLU " total elapsed time =" RST " %.3f s, " 
-               KBLU "average time per call =" RST " %.3f μs ," KBLU " bandwidth = " RST " %.3f MB/s \n",
-               bench.label, size ,loops, total_sec, avg_time, max_bw );
+        printf("%-18s, %-12s, %-10d, %-6d, %-10.3f, %-9.3f, %-8.3f, %-7s\n",
+               bench.label, "0", size, loops, total_sec, avg_time, max_bw, "0");
    }
 }

 int main(){
-    int matches[] = {0, MAX_MATCHES};
+    const int matches[] = {0, MAX_MATCHES};
    std::vector<size_t> sizes;
-    for (size_t i = 0; i < N; i++) sizes.push_back(16000 << i*2);
+    for (size_t i = 0; i < N; i++)
+        sizes.push_back(16000 << i * 2);
    const char charset[] = "aAaAaAaAAAaaaaAAAAaaaaAAAAAAaaaAAaaa";
-  
+    printf("%-18s, %-12s, %-10s, %-6s, %-10s, %-9s, %-8s, %-7s\n", "Matcher",
+           "max_matches", "size", "loops", "total_sec", "avg_time", "max_bw",
+           "avg_bw");
    for (int m = 0; m < 2; m++) {
        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Shufti", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::shuftiBuildMasks(b.chars,
+                                          reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                          reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return shuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return shuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
+                                      b.buf.data() + b.size);
+                });
        }

        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Reverse Shufti", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::shuftiBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::shuftiBuildMasks(b.chars,
+                                          reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                          reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return rshuftiExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return rshuftiExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
+                                       b.buf.data() + b.size);
+                });
        }

        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Truffle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::truffleBuildMasks(b.chars,
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return truffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return truffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
+                                       b.buf.data() + b.size);
+                });
        }

        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Reverse Truffle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::truffleBuildMasks(b.chars,
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return rtruffleExec(b.lo, b.hi, b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return rtruffleExec(b.truffle_mask_lo, b.truffle_mask_hi, b.buf.data(),
+                                        b.buf.data() + b.size);
+                });
        }
+#ifdef CAN_USE_WIDE_TRUFFLE
+        if(CAN_USE_WIDE_TRUFFLE) {
+            for (size_t i = 0; i < std::size(sizes); i++) {
+                MicroBenchmark bench("Truffle Wide", sizes[i]);
+                run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+                    [&](MicroBenchmark &b) {
+                        b.chars.set('a');
+                        ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
+                        memset(b.buf.data(), 'b', b.size);
+                    },
+                    [&](MicroBenchmark const &b) {
+                        return truffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
+                    }
+                );
+            }
+
+            for (size_t i = 0; i < std::size(sizes); i++) {
+                MicroBenchmark bench("Reverse Truffle Wide", sizes[i]);
+                run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+                    [&](MicroBenchmark &b) {
+                        b.chars.set('a');
+                        ue2::truffleBuildMasksWide(b.chars, reinterpret_cast<u8 *>(&b.truffle_mask));
+                        memset(b.buf.data(), 'b', b.size);
+                    },
+                    [&](MicroBenchmark const &b) {
+                        return rtruffleExecWide(b.truffle_mask, b.buf.data(), b.buf.data() + b.size);
+                    }
+                );
+            }
+        }
+#endif

        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Vermicelli", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::truffleBuildMasks(b.chars,
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return vermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return vermicelliExec('a', 'b', b.buf.data(),
+                                          b.buf.data() + b.size);
+                });
        }

        for (size_t i = 0; i < std::size(sizes); i++) {
            MicroBenchmark bench("Reverse Vermicelli", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], true, bench,
                [&](MicroBenchmark &b) {
                    b.chars.set('a');
-                    ue2::truffleBuildMasks(b.chars, (u8 *)&b.lo, (u8 *)&b.hi);
+                    ue2::truffleBuildMasks(b.chars,
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_lo),
+                                           reinterpret_cast<u8 *>(&b.truffle_mask_hi));
                    memset(b.buf.data(), 'b', b.size);
                },
-                [&](MicroBenchmark &b) {
-                    return rvermicelliExec('a', 'b', b.buf.data(), b.buf.data() + b.size);
-                }
-            );
+                [&](MicroBenchmark const &b) {
+                    return rvermicelliExec('a', 'b', b.buf.data(),
+                                           b.buf.data() + b.size);
+                });
        }

        for (size_t i = 0; i < std::size(sizes); i++) {
-            //we imitate the noodle unit tests
+            // we imitate the noodle unit tests
            std::string str;
            const size_t char_len = 5;
-            str.resize(char_len + 1);
-            for (size_t j=0; j < char_len; j++) {
-                srand (time(NULL));
-                int key = rand() % + 36 ;
+            str.resize(char_len + 2);
+            for (size_t j = 0; j < char_len; j++) {
+                srand(time(NULL));
+                int key = rand() % +36;
                str[char_len] = charset[key];
                str[char_len + 1] = '\0';
            }

            MicroBenchmark bench("Noodle", sizes[i]);
-            run_benchmarks(sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
+            run_benchmarks(
+                sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench,
                [&](MicroBenchmark &b) {
                    ctxt.clear();
                    memset(b.buf.data(), 'a', b.size);
                    u32 id = 1000;
                    ue2::hwlmLiteral lit(str, true, id);
                    b.nt = ue2::noodBuildTable(lit);
-                    assert(b.nt != nullptr);
+                    assert(b.nt.get() != nullptr);
                },
-                [&](MicroBenchmark &b) {
-                    noodExec(b.nt.get(), b.buf.data(), b.size, 0, hlmSimpleCallback, &b.scratch);
+                [&](MicroBenchmark &b) { // cppcheck-suppress constParameterReference
+                    noodExec(b.nt.get(), b.buf.data(), b.size, 0,
+                             hlmSimpleCallback, &b.scratch);
                    return b.buf.data() + b.size;
-                }
-           );
+                });
        }
    }

--- a/benchmarks/benchmarks.hpp
+++ b/benchmarks/benchmarks.hpp
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2020, 2021, VectorCamp PC
+ * Copyright (c) 2024, Arm Limited
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -26,44 +27,41 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */

+#include "hwlm/hwlm_literal.h"
+#include "hwlm/noodle_build.h"
+#include "hwlm/noodle_engine.h"
+#include "hwlm/noodle_internal.h"
 #include "nfa/shufti.h"
 #include "nfa/shufticompile.h"
 #include "nfa/truffle.h"
 #include "nfa/trufflecompile.h"
 #include "nfa/vermicelli.hpp"
-#include "hwlm/noodle_build.h"
-#include "hwlm/noodle_engine.h"
-#include "hwlm/noodle_internal.h"
-#include "hwlm/hwlm_literal.h"
-#include "util/bytecode_ptr.h"
 #include "scratch.h"
+#include "util/bytecode_ptr.h"

-/*define colour control characters*/
-#define RST  "\x1B[0m"
-#define KRED  "\x1B[31m"
-#define KGRN  "\x1B[32m"
-#define KYEL  "\x1B[33m"
-#define KBLU  "\x1B[34m"
-#define KMAG  "\x1B[35m"
-#define KCYN  "\x1B[36m"
-#define KWHT  "\x1B[37m"
-
-class MicroBenchmark
-{
+class MicroBenchmark {
 public:
-  char const *label;
-  size_t size;
+    struct hs_scratch scratch{};
+    char const *label;
+    size_t size;
+    std::vector<u8> buf;
+    ue2::bytecode_ptr<noodTable> nt;
+    ue2::CharReach chars;

-  // Shufti/Truffle
-  m128 lo, hi;
-  ue2::CharReach chars;
-  std::vector<u8> buf;
+    // Shufti/Truffle
+    union {
+        m256 truffle_mask;
+        struct {
+#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
+            m128 truffle_mask_lo;
+            m128 truffle_mask_hi;
+#else
+            m128 truffle_mask_hi;
+            m128 truffle_mask_lo;
+#endif
+        };
+    };

-  // Noodle
-  struct hs_scratch scratch;
-  ue2::bytecode_ptr<noodTable> nt;
-
-  MicroBenchmark(char const *label_, size_t size_)
-  :label(label_), size(size_), buf(size_) {
-  };
+    MicroBenchmark(char const *label_, size_t size_)
+        : label(label_), size(size_), buf(size_){};
 };
--- a/cmake/archdetect.cmake
+++ b/cmake/archdetect.cmake
@ -9,22 +9,35 @@ if (USE_CPU_NATIVE)
        # the flag), so use that for tune.

        set(TUNE_FLAG "mtune")
-        set(GNUCC_TUNE "")
+
+        # set the default fallback values for the arch and tune to native, in case we can't parse them properly later
+        set(GNUCC_ARCH "native") 
+        set(GNUCC_TUNE "native")
        message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")

        # arg1 might exist if using ccache
        string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1)
-        set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=native -${TUNE_FLAG}=native)
+        set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -${ARCH_FLAG}=${GNUCC_ARCH} -${TUNE_FLAG}=${GNUCC_TUNE})
        execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS}
            OUTPUT_VARIABLE _GCC_OUTPUT)
        set(_GCC_OUTPUT_TUNE ${_GCC_OUTPUT})
        string(FIND "${_GCC_OUTPUT}" "${ARCH_FLAG}=" POS)
        string(SUBSTRING "${_GCC_OUTPUT}" ${POS} -1 _GCC_OUTPUT)
-        string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_ARCH "${_GCC_OUTPUT}")
+        string(REGEX REPLACE "${ARCH_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" _GNUCC_ARCH "${_GCC_OUTPUT}")
+
+        # Only overwrite arch if non-empty
+        if(NOT _GNUCC_ARCH STREQUAL "")
+            set(GNUCC_ARCH ${_GNUCC_ARCH})
+        endif()

        string(FIND "${_GCC_OUTPUT_TUNE}" "${TUNE_FLAG}=" POS_TUNE)
        string(SUBSTRING "${_GCC_OUTPUT_TUNE}" ${POS_TUNE} -1 _GCC_OUTPUT_TUNE)
-        string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
+        string(REGEX REPLACE "${TUNE_FLAG}=[ \t]*([^ \n]*)[ \n].*" "\\1" _GNUCC_TUNE "${_GCC_OUTPUT_TUNE}")
+
+        # Only overwrite tune if non-empty
+        if (NOT _GNUCC_TUNE STREQUAL "")
+            set(GNUCC_TUNE ${_GNUCC_TUNE})
+        endif()

        message(STATUS "ARCH_FLAG '${ARCH_FLAG}' '${GNUCC_ARCH}', TUNE_FLAG '${TUNE_FLAG}' '${GNUCC_TUNE}' ")

@ -44,7 +57,7 @@ if (USE_CPU_NATIVE)
        endif()
    elseif (CMAKE_COMPILER_IS_CLANG)
        if (ARCH_IA32 OR ARCH_X86_64)
-            set(GNUCC_ARCH x86_64_v2)
+            set(GNUCC_ARCH x86-64-v2)
            set(TUNE_FLAG generic)
        elseif(ARCH_AARCH64)
            if (BUILD_SVE2_BITPERM)
@ -67,8 +80,25 @@ if (USE_CPU_NATIVE)
        message(STATUS "clang will tune for ${GNUCC_ARCH}, ${TUNE_FLAG}")
    endif()
 else()
-    if (ARCH_IA32 OR ARCH_X86_64)
-        set(GNUCC_ARCH native)
+    if (SIMDE_BACKEND)
+        if (ARCH_IA32 OR ARCH_X86_64)
+            set(GNUCC_ARCH x86-64-v2)
+            set(TUNE_FLAG generic)
+        elseif(ARCH_AARCH64)
+            set(GNUCC_ARCH armv8-a)
+            set(TUNE_FLAG generic)
+        elseif(ARCH_ARM32)
+            set(GNUCC_ARCH armv7a)
+            set(TUNE_FLAG generic)
+        elseif(ARCH_PPC64EL)
+            set(GNUCC_ARCH power8)
+            set(TUNE_FLAG power8)
+        else()
+            set(GNUCC_ARCH x86-64-v2)
+            set(TUNE_FLAG generic)
+        endif()
+    elseif (ARCH_IA32 OR ARCH_X86_64)
+        set(GNUCC_ARCH ${X86_ARCH})
        set(TUNE_FLAG generic)
    elseif(ARCH_AARCH64)
        if (BUILD_SVE2_BITPERM)
@ -84,8 +114,11 @@ else()
    elseif(ARCH_ARM32)
       set(GNUCC_ARCH armv7a)
       set(TUNE_FLAG generic)
+    elseif(ARCH_PPC64EL)
+       set(GNUCC_ARCH power8)
+       set(TUNE_FLAG power8)
    else()
-       set(GNUCC_ARCH power9)
-       set(TUNE_FLAG power9)
+       set(GNUCC_ARCH native)
+       set(TUNE_FLAG native)
    endif()
 endif()
--- a/cmake/build_wrapper.sh
+++ b/cmake/build_wrapper.sh
@ -15,13 +15,21 @@ SYMSFILE=$(mktemp -p /tmp ${PREFIX}_rename.syms.XXXXX)
 KEEPSYMS=$(mktemp -p /tmp keep.syms.XXXXX)
 # find the libc used by gcc
 LIBC_SO=$("$@" --print-file-name=libc.so.6)
+NM_FLAG="-f"
+if [ "$(uname)" = "FreeBSD" ]; then
+    # On FreeBSD, name the libc shared object explicitly;
+    # on Linux, keep the path reported by the compiler above.
+    LIBC_SO=/lib/libc.so.7
+    # BSD nm spells the output-format option -F where Linux nm uses -f.
+    NM_FLAG="-F"
+fi
 cp ${KEEPSYMS_IN} ${KEEPSYMS}
 # get all symbols from libc and turn them into patterns
-nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
+nm ${NM_FLAG} p -g -D ${LIBC_SO} | sed 's/\([^ @]*\).*/^\1$/' >> ${KEEPSYMS}
 # build the object
 "$@"
 # rename the symbols in the object
-nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
+nm ${NM_FLAG} p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE}
 if test -s ${SYMSFILE}
 then
    objcopy --redefine-syms=${SYMSFILE} ${OUT}
--- a/cmake/cflags-generic.cmake
+++ b/cmake/cflags-generic.cmake
@ -1,22 +1,13 @@
 # set compiler flags - more are tested and added later
-set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra -Wshadow -Wcast-qual -fno-strict-aliasing")
-set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra -Wshadow -Wswitch -Wreturn-type -Wcast-qual -Wno-deprecated -Wnon-virtual-dtor -fno-strict-aliasing")
+set(EXTRA_C_FLAGS "${OPT_C_FLAG} -std=c17 -Wall -Wextra ")
+set(EXTRA_CXX_FLAGS "${OPT_CXX_FLAG} -std=c++17 -Wall -Wextra ")
 if (NOT CMAKE_COMPILER_IS_CLANG)
    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-new-ttp-matching")
 endif()

-if (NOT RELEASE_BUILD)
-    # -Werror is most useful during development, don't potentially break
-    # release builds
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")
-    if (CMAKE_COMPILER_IS_CLANG)
-    	if (CMAKE_C_COMPILER_VERSION VERSION_GREATER "13.0")
-           set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-unused-but-set-variable")
-           set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
-        endif()
-    endif()
-endif()
+# Always use -Werror, including for release builds (-Wall is already set above)
+set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Werror")
+set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Werror")

 if (DISABLE_ASSERTS)
    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DNDEBUG")
@ -25,28 +16,32 @@ endif()

 if(CMAKE_COMPILER_IS_GNUCC)
    # spurious warnings?
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds ") #-Wno-maybe-uninitialized")
 endif()

 if(CMAKE_COMPILER_IS_GNUCXX)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fabi-version=0")
-    endif ()
-    # don't complain about abi
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-abi")
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi")
-endif()
-
-if (NOT(ARCH_IA32 AND RELEASE_BUILD))
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fno-omit-frame-pointer")
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -fno-omit-frame-pointer")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-maybe-uninitialized -Wno-uninitialized")
 endif()

 CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
 CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
 CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)

+if(FREEBSD OR NETBSD)
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -gdwarf-4")
+endif()
+
+if(NETBSD)
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DHAVE_BUILTIN_POPCOUNT")
+endif()
+
+if(MACOSX)
+    # Boost headers trigger these warnings on macOS
+    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
+    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-deprecated-declarations -Wno-unused-parameter")
+endif()
+
 # these end up in the config file
 CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN)
 CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN)
@ -71,94 +66,41 @@ if (NOT CMAKE_COMPILER_IS_CLANG)
   CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P)
 endif()

-set(C_FLAGS_TO_CHECK
-# Variable length arrays are way bad, most especially at run time
-"-Wvla"
-# Pointer arith on void pointers is doing it wrong.
- "-Wpointer-arith"
-# Build our C code with -Wstrict-prototypes -Wmissing-prototypes
- "-Wstrict-prototypes"
- "-Wmissing-prototypes"
-)
-foreach (FLAG ${C_FLAGS_TO_CHECK})
-    # munge the name so it doesn't break things
-    string(REPLACE "-" "_" FNAME C_FLAG${FLAG})
-    CHECK_C_COMPILER_FLAG("${FLAG}" ${FNAME})
-    if (${FNAME})
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} ${FLAG}")
-    endif()
-endforeach()
-
-# self-assign should be thrown away, but clang whinges
-CHECK_C_COMPILER_FLAG("-Wself-assign" CC_SELF_ASSIGN)
-if (CC_SELF_ASSIGN)
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-self-assign")
-endif()
-CHECK_CXX_COMPILER_FLAG("-Wself-assign" CXX_SELF_ASSIGN)
-if (CXX_SELF_ASSIGN)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-self-assign")
-endif()
-
-# clang gets up in our face for going paren crazy with macros
-CHECK_C_COMPILER_FLAG("-Wparentheses-equality" CC_PAREN_EQUALITY)
-if (CC_PAREN_EQUALITY)
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-parentheses-equality")
-endif()
-
-# clang complains about unused const vars in our Ragel-generated code.
-CHECK_CXX_COMPILER_FLAG("-Wunused-const-variable" CXX_UNUSED_CONST_VAR)
-if (CXX_UNUSED_CONST_VAR)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-const-variable")
-endif()
-
 # clang-14 complains about unused-but-set variable.
 CHECK_CXX_COMPILER_FLAG("-Wunused-but-set-variable" CXX_UNUSED_BUT_SET_VAR)
 if (CXX_UNUSED_BUT_SET_VAR)
    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-unused-but-set-variable")
 endif()

-# clang-14 complains about using bitwise operator instead of logical ones.
-CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL)
-if (CXX_BITWISE_INSTEAD_OF_LOGICAL)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical")
-endif()
-
-# clang-14 complains about using bitwise operator instead of logical ones.
-CHECK_CXX_COMPILER_FLAG("-Wbitwise-instead-of-logical" CXX_BITWISE_INSTEAD_OF_LOGICAL)
-if (CXX_BITWISE_INSTEAD_OF_LOGICAL)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-bitwise-instead-of-logical")
-endif()
-
 CHECK_CXX_COMPILER_FLAG("-Wignored-attributes" CXX_IGNORED_ATTR)
-if (CXX_IGNORED_ATTR)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
+if(CMAKE_COMPILER_IS_GNUCC)
+    if (CXX_IGNORED_ATTR)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-ignored-attributes")
+    endif()
 endif()

-# gcc 9 complains about redundant move for returned variable
-CHECK_CXX_COMPILER_FLAG("-Wredundant-move" CXX_REDUNDANT_MOVE)
-if (CXX_REDUNDANT_MOVE)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-redundant-move")
+CHECK_CXX_COMPILER_FLAG("-Wnonnull" CXX_NON_NULL)  # probe the warning that -Wno-nonnull below disables
+if(CMAKE_COMPILER_IS_GNUCC)
+    if (CXX_NON_NULL)
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-nonnull")
+    endif()
 endif()

 # note this for later, g++ doesn't have this flag but clang does
 CHECK_CXX_COMPILER_FLAG("-Wweak-vtables" CXX_WEAK_VTABLES)
-if (CXX_WEAK_VTABLES)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wweak-vtables")
-endif()

 CHECK_CXX_COMPILER_FLAG("-Wmissing-declarations" CXX_MISSING_DECLARATIONS)
-if (CXX_MISSING_DECLARATIONS)
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wmissing-declarations")
-endif()

 CHECK_CXX_COMPILER_FLAG("-Wunused-local-typedefs" CXX_UNUSED_LOCAL_TYPEDEFS)

 CHECK_CXX_COMPILER_FLAG("-Wunused-variable" CXX_WUNUSED_VARIABLE)

-# gcc 10 complains about this
-CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
-CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
-if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
-    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow")
-    set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow")
+# gcc complains about this
+if(CMAKE_COMPILER_IS_GNUCC)
+    CHECK_C_COMPILER_FLAG("-Wstringop-overflow" CC_STRINGOP_OVERFLOW)
+    CHECK_CXX_COMPILER_FLAG("-Wstringop-overflow" CXX_STRINGOP_OVERFLOW)
+    if(CC_STRINGOP_OVERFLOW OR CXX_STRINGOP_OVERFLOW)
+        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
+        set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-stringop-overflow -Wno-stringop-overread")
+    endif()
 endif()
--- a/cmake/cflags-ppc64le.cmake
+++ b/cmake/cflags-ppc64le.cmake
@ -16,3 +16,12 @@ int main() {
 if (NOT HAVE_VSX)
    message(FATAL_ERROR "VSX support required for Power support")
 endif ()
+
+# fix unit-internal seg fault for freebsd and gcc13
+if (FREEBSD AND CMAKE_COMPILER_IS_GNUCXX)
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "13")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+        set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+    endif ()
+endif ()
--- a/cmake/cflags-x86.cmake
+++ b/cmake/cflags-x86.cmake
@ -1,33 +1,42 @@
 option(BUILD_AVX512 "Enabling support for AVX512" OFF)
 option(BUILD_AVX512VBMI "Enabling support for AVX512VBMI" OFF)

-set(SKYLAKE_FLAG "-march=skylake-avx512")
-set(ICELAKE_FLAG "-march=icelake-server")
+set(SKYLAKE_ARCH "skylake-avx512")
+set(ICELAKE_ARCH "icelake-server")
+set(SKYLAKE_FLAG "-march=${SKYLAKE_ARCH}")
+set(ICELAKE_FLAG "-march=${ICELAKE_ARCH}")

 if (NOT FAT_RUNTIME)
    if (BUILD_AVX512VBMI)
        message (STATUS "AVX512VBMI implies AVX512, enabling BUILD_AVX512")
        set(BUILD_AVX512 ON)
+        set(BUILD_AVX2 ON)
        set(ARCH_C_FLAGS "${ICELAKE_FLAG}")
        set(ARCH_CXX_FLAGS "${ICELAKE_FLAG}")
-    endif ()
-    if (BUILD_AVX512)
+        set(X86_ARCH "${ICELAKE_ARCH}")
+    elseif (BUILD_AVX512)
        message (STATUS "AVX512 implies AVX2, enabling BUILD_AVX2")
        set(BUILD_AVX2 ON)
        set(ARCH_C_FLAGS "${SKYLAKE_FLAG}")
        set(ARCH_CXX_FLAGS "${SKYLAKE_FLAG}")
-    endif ()
-    if (BUILD_AVX2)
+        set(X86_ARCH "${SKYLAKE_ARCH}")
+    elseif (BUILD_AVX2)
        message (STATUS "Enabling BUILD_AVX2")
        set(ARCH_C_FLAGS "-mavx2")
        set(ARCH_CXX_FLAGS "-mavx2")
+        set(X86_ARCH "core-avx2")
    else()
        set(ARCH_C_FLAGS "-msse4.2")
        set(ARCH_CXX_FLAGS "-msse4.2")
+        set(X86_ARCH "x86-64-v2")
    endif()
 else()
+    set(BUILD_AVX512VBMI ON)
+    set(BUILD_AVX512 ON)
+    set(BUILD_AVX2 ON)
    set(ARCH_C_FLAGS "-msse4.2")
    set(ARCH_CXX_FLAGS "-msse4.2")
+    set(X86_ARCH "x86-64-v2")
 endif()

 set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}")
@ -129,5 +138,3 @@ else (NOT FAT_RUNTIME)
        message(FATAL_ERROR "A minimum of SSE4.2 compiler support is required")
    endif ()
 endif ()
-
-
--- a/cmake/compiler.cmake
+++ b/cmake/compiler.cmake
@ -6,6 +6,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS CLANGCXX_MINVER)
        message(FATAL_ERROR "A minimum of clang++ ${CLANGCXX_MINVER} is required for C++17 support")
    endif()
+    string (REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)$" "\\1" CLANG_MAJOR_VERSION "${CMAKE_CXX_COMPILER_VERSION}")
 endif()

 # compiler version checks TODO: test more compilers
--- a/cmake/osdetection.cmake
+++ b/cmake/osdetection.cmake
@ -4,27 +4,43 @@ endif(CMAKE_SYSTEM_NAME MATCHES "Linux")

 if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
    set(FREEBSD true)
+    set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    #FIXME: find a nicer and more general way of doing this
+    if(CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc13")
+        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc13")
+    elseif(ARCH_AARCH64 AND (CMAKE_C_COMPILER MATCHES "/usr/local/bin/gcc12"))
+        set(CMAKE_BUILD_RPATH "/usr/local/lib/gcc12")
+    endif()
 endif(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")

-option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
-message("Checking Fat Runtime Requirements...")
-if (FAT_RUNTIME AND NOT LINUX)
-    message(FATAL_ERROR "Fat runtime is only supported on Linux OS")
+if(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    set(NETBSD true)
+endif(CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    set(MACOSX TRUE)
 endif()

-if (USE_CPU_NATIVE AND FAT_RUNTIME)
-    message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
+if (ARCH_IA32 OR ARCH_X86_64)
+  option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" ON)
+else()
+  option(FAT_RUNTIME "Build a library that supports multiple microarchitectures" OFF)
 endif()

-if (FAT_RUNTIME AND LINUX)
+if (FAT_RUNTIME)
+    message("Checking Fat Runtime Requirements...")
+    if (USE_CPU_NATIVE)  # FAT_RUNTIME is already guaranteed by the enclosing if()
+        message(FATAL_ERROR "Fat runtime is not compatible with Native CPU detection")
+    endif()
+
    if (NOT (ARCH_IA32 OR ARCH_X86_64 OR ARCH_AARCH64))
        message(FATAL_ERROR "Fat runtime is only supported on Intel and Aarch64 architectures")
    else()
        message(STATUS "Building Fat runtime for multiple microarchitectures")
-	message(STATUS "generator is ${CMAKE_GENERATOR}")
+        message(STATUS "generator is ${CMAKE_GENERATOR}")
        if (NOT (CMAKE_GENERATOR MATCHES "Unix Makefiles" OR
            (CMAKE_VERSION VERSION_GREATER "3.0" AND CMAKE_GENERATOR MATCHES "Ninja")))
-	    message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
+            message (FATAL_ERROR "Building the fat runtime requires the Unix Makefiles generator, or Ninja with CMake v3.0 or higher")
        else()
            include (${CMAKE_MODULE_PATH}/attrib.cmake)
            if (NOT HAS_C_ATTR_IFUNC)
@ -36,5 +52,3 @@ if (FAT_RUNTIME AND LINUX)
        message(FATAL_ERROR "Fat runtime is only built on Release builds")
    endif()
 endif ()
-
-
--- a/cmake/pcre.cmake
+++ b/cmake/pcre.cmake
@ -30,7 +30,7 @@ if (PCRE_BUILD_SOURCE)
    #if PCRE_MAJOR != ${PCRE_REQUIRED_MAJOR_VERSION} || PCRE_MINOR < ${PCRE_REQUIRED_MINOR_VERSION}
    #error Incorrect pcre version
    #endif
-    main() {}" CORRECT_PCRE_VERSION)
+    int main(void) {return 0;}" CORRECT_PCRE_VERSION)
    set (CMAKE_REQUIRED_INCLUDES "${saved_INCLUDES}")

    if (NOT CORRECT_PCRE_VERSION)
--- a/cmake/simde.cmake
+++ b/cmake/simde.cmake
@ -0,0 +1,40 @@
+LIST(APPEND CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/simde)
+
+CHECK_INCLUDE_FILES(simde/x86/sse4.2.h SIMDE_SSE42_H_FOUND)
+
+if (SIMDE_SSE42_H_FOUND)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_BACKEND")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_BACKEND")
+  include_directories(${PROJECT_SOURCE_DIR}/simde)
+
+  if (CMAKE_COMPILER_IS_CLANG)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_NO_CHECK_IMMEDIATE_CONSTANT")
+    if (ARCH_PPC64EL)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecated-altivec-src-compat")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-altivec-src-compat")
+	if (CLANG_MAJOR_VERSION EQUAL 15)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-deprecate-lax-vec-conv-all")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecate-lax-vec-conv-all")
+        endif ()
+    endif()
+  endif()
+
+  if (BUILD_SSE2_SIMDE)
+    message("using BUILD_SSE2_SIMDE..")
+    set(SIMDE_NATIVE true)
+    set(ARCH_C_FLAGS "-msse2")
+    set(ARCH_CXX_FLAGS "-msse2")
+    set(X86_ARCH "x86-64")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DVS_SIMDE_BACKEND")
+  endif()
+
+  if (SIMDE_NATIVE AND NOT BUILD_SSE2_SIMDE)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVS_SIMDE_NATIVE -DSIMDE_ENABLE_OPENMP -fopenmp-simd")
+  endif()
+
+else()
+  message(FATAL_ERROR "SIMDe backend requested but SIMDe is not available on the system")
+endif()
--- a/cmake/sqlite3.cmake
+++ b/cmake/sqlite3.cmake
@ -1,15 +1,9 @@
 #
-# a lot of noise to find sqlite
+# sqlite is only used in hsbench, no need to special case its build, depend only on OS installations using pkg-config
 #

-option(SQLITE_PREFER_STATIC "Build sqlite3 statically instead of using an installed lib" OFF)
-
-if(NOT SQLITE_PREFER_STATIC)
-find_package(PkgConfig QUIET)
-
 # first check for sqlite on the system
 pkg_check_modules(SQLITE3 sqlite3)
-endif()

 # now do version checks
 if (SQLITE3_FOUND)
@ -17,20 +11,9 @@ if (SQLITE3_FOUND)
    if (SQLITE_VERSION LESS "3.8.10")
        message(FATAL_ERROR "sqlite3 is broken from 3.8.7 to 3.8.10 - please find a working version")
    endif()
-endif()

-if (NOT SQLITE3_BUILD_SOURCE)
-    set(_SAVED_FLAGS ${CMAKE_REQUIRED_FLAGS})
    list(INSERT CMAKE_REQUIRED_LIBRARIES 0 ${SQLITE3_LDFLAGS})
    CHECK_SYMBOL_EXISTS(sqlite3_open_v2 sqlite3.h HAVE_SQLITE3_OPEN_V2)
    list(REMOVE_ITEM CMAKE_REQUIRED_INCLUDES "${SQLITE3_INCLUDE_DIRS}")
    list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${SQLITE3_LDFLAGS})
-else()
-    if (NOT TARGET sqlite3_static)
-    # build sqlite as a static lib to compile into our test programs
-    add_library(sqlite3_static STATIC "${PROJECT_SOURCE_DIR}/sqlite3/sqlite3.c")
-    set_target_properties(sqlite3_static PROPERTIES COMPILE_FLAGS "-Wno-error -Wno-extra -Wno-unused -Wno-cast-qual -DSQLITE_OMIT_LOAD_EXTENSION")
-    endif()
 endif()
-
-# that's enough about sqlite
--- a/cppcheck-suppression-list.txt
+++ b/cppcheck-suppression-list.txt
@ -0,0 +1,15 @@
+unknownMacro:*gtest-all.cc
+knownConditionTrueFalse:*Parser.rl
+knownConditionTrueFalse:*Parser.cpp
+variableScope:*Parser.rl
+duplicateBreak:*.rl
+unreadVariable:*control_verbs.cpp
+unreachableCode:*rose_build_dump.cpp
+*:*simde/*
+assertWithSideEffect
+syntaxError
+internalError
+checkersReport
+missingInclude
+missingIncludeSystem
+unmatchedSuppression
--- a/doc/dev-reference/CMakeLists.txt
+++ b/doc/dev-reference/CMakeLists.txt
@ -19,6 +19,7 @@ else()
 set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
 set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
 set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+set(SPHINX_MAN_DIR "${CMAKE_CURRENT_BINARY_DIR}/man")

 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
    "${CMAKE_CURRENT_BINARY_DIR}/conf.py" @ONLY)
@ -32,4 +33,14 @@ add_custom_target(dev-reference
        "${SPHINX_HTML_DIR}"
    DEPENDS dev-reference-doxygen
    COMMENT "Building HTML dev reference with Sphinx")
+
+add_custom_target(dev-reference-man
+    ${SPHINX_BUILD}
+        -b man
+        -c "${CMAKE_CURRENT_BINARY_DIR}"
+        -d "${SPHINX_CACHE_DIR}"
+        "${CMAKE_CURRENT_SOURCE_DIR}"
+        "${SPHINX_MAN_DIR}"
+    DEPENDS dev-reference-doxygen
+    COMMENT "Building man page reference with Sphinx")
 endif()
--- a/doc/dev-reference/chimera.rst
+++ b/doc/dev-reference/chimera.rst
@ -11,10 +11,10 @@ Introduction
 ************

 Chimera is a software regular expression matching engine that is a hybrid of
-Hyperscan and PCRE. The design goals of Chimera are to fully support PCRE
-syntax as well as to take advantage of the high performance nature of Hyperscan.
+Vectorscan and PCRE. The design goals of Chimera are to fully support PCRE
+syntax as well as to take advantage of the high performance nature of Vectorscan.

-Chimera inherits the design guideline of Hyperscan with C APIs for compilation
+Chimera inherits the design guideline of Vectorscan with C APIs for compilation
 and scanning.

 The Chimera API itself is composed of two major components:
@ -65,13 +65,13 @@ For a given database, Chimera provides several guarantees:
 .. note:: Chimera is designed to have the same matching behavior as PCRE,
   including greedy/ungreedy, capturing, etc. Chimera reports both
   **start offset** and **end offset** for each match like PCRE. Different
-   from the fashion of reporting all matches in Hyperscan, Chimera only reports
+   from the fashion of reporting all matches in Vectorscan, Chimera only reports
   non-overlapping matches. For example, the pattern :regexp:`/foofoo/` will
   match ``foofoofoofoo`` at offsets (0, 6) and (6, 12).

-.. note:: Since Chimera is a hybrid of Hyperscan and PCRE in order to support
+.. note:: Since Chimera is a hybrid of Vectorscan and PCRE in order to support
   full PCRE syntax, there will be extra performance overhead compared to
-   Hyperscan-only solution. Please always use Hyperscan for better performance
+   Vectorscan-only solution. Please always use Vectorscan for better performance
   unless you must need full PCRE syntax support.

 See :ref:`chruntime` for more details
@ -83,12 +83,12 @@ Requirements
 The PCRE library (http://pcre.org/) version 8.41 is required for Chimera.

 .. note:: Since Chimera needs to reference PCRE internal function, please place PCRE source
-   directory under Hyperscan root directory in order to build Chimera.
+   directory under Vectorscan root directory in order to build Chimera.

-Beside this, both hardware and software requirements of Chimera are the same to Hyperscan.
+Besides this, both hardware and software requirements of Chimera are the same as those of Vectorscan.
 See :ref:`hardware` and :ref:`software` for more details.

-.. note:: Building Hyperscan will automatically generate Chimera library.
+.. note:: Building Vectorscan will automatically generate Chimera library.
   Currently only static library is supported for Chimera, so please
   use static build type when configure CMake build options.

@ -119,7 +119,7 @@ databases:

 Compilation allows the Chimera library to analyze the given pattern(s) and
 pre-determine how to scan for these patterns in an optimized fashion using
-Hyperscan and PCRE.
+Vectorscan and PCRE.

 ===============
 Pattern Support
@ -134,7 +134,7 @@ Semantics
 =========

 Chimera supports the exact same semantics of PCRE library. Moreover, it supports
-multiple simultaneous pattern matching like Hyperscan and the multiple matches
+multiple simultaneous pattern matching like Vectorscan and the multiple matches
 will be reported in order by end offset.

 .. _chruntime:
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@ -9,7 +9,7 @@ Compiling Patterns
 Building a Database
 *******************

-The Hyperscan compiler API accepts regular expressions and converts them into a
+The Vectorscan compiler API accepts regular expressions and converts them into a
 compiled pattern database that can then be used to scan data.

 The API provides three functions that compile regular expressions into
@ -24,7 +24,7 @@ databases:
 #. :c:func:`hs_compile_ext_multi`: compiles an array of expressions as above,
   but allows :ref:`extparam` to be specified for each expression.

-Compilation allows the Hyperscan library to analyze the given pattern(s) and
+Compilation allows the Vectorscan library to analyze the given pattern(s) and
 pre-determine how to scan for these patterns in an optimized fashion that would
 be far too expensive to compute at run-time.

@ -48,10 +48,10 @@ To compile patterns to be used in streaming mode, the ``mode`` parameter of
 block mode requires the use of :c:member:`HS_MODE_BLOCK` and vectored mode
 requires the use of :c:member:`HS_MODE_VECTORED`. A pattern database compiled
 for one mode (streaming, block or vectored) can only be used in that mode. The
-version of Hyperscan used to produce a compiled pattern database must match the
-version of Hyperscan used to scan with it.
+version of Vectorscan used to produce a compiled pattern database must match the
+version of Vectorscan used to scan with it.

-Hyperscan provides support for targeting a database at a particular CPU
+Vectorscan provides support for targeting a database at a particular CPU
 platform; see :ref:`instr_specialization` for details.

 =====================
@ -75,14 +75,14 @@ characters exist in regular grammar like ``[``, ``]``, ``(``, ``)``, ``{``,
 While in pure literal case, all these meta characters lost extra meanings
 expect for that they are just common ASCII codes.

-Hyperscan is initially designed to process common regular expressions. It is
+Vectorscan is initially designed to process common regular expressions. It is
 hence embedded with a complex parser to do comprehensive regular grammar
 interpretation. Particularly, the identification of above meta characters is the
 basic step for the interpretation of far more complex regular grammars.

 However in real cases, patterns may not always be regular expressions. They
 could just be pure literals. Problem will come if the pure literals contain
-regular meta characters. Supposing fed directly into traditional Hyperscan
+regular meta characters. Supposing fed directly into traditional Vectorscan
 compile API, all these meta characters will be interpreted in predefined ways,
 which is unnecessary and the result is totally out of expectation. To avoid
 such misunderstanding by traditional API, users have to preprocess these
@ -90,7 +90,7 @@ literal patterns by converting the meta characters into some other formats:
 either by adding a backslash ``\`` before certain meta characters, or by
 converting all the characters into a hexadecimal representation.

-In ``v5.2.0``, Hyperscan introduces 2 new compile APIs for pure literal patterns:
+In ``v5.2.0``, Vectorscan introduces 2 new compile APIs for pure literal patterns:

 #. :c:func:`hs_compile_lit`: compiles a single pure literal into a pattern
   database.
@ -106,7 +106,7 @@ content directly into these APIs without worrying about writing regular meta
 characters in their patterns. No preprocessing work is needed any more.

 For new APIs, the ``length`` of each literal pattern is a newly added parameter.
-Hyperscan needs to locate the end position of the input expression via clearly
+Vectorscan needs to locate the end position of the input expression via clearly
 knowing each literal's length, not by simply identifying character ``\0`` of a
 string.

@ -127,19 +127,19 @@ Supported flags: :c:member:`HS_FLAG_CASELESS`, :c:member:`HS_FLAG_SINGLEMATCH`,
 Pattern Support
 ***************

-Hyperscan supports the pattern syntax used by the PCRE library ("libpcre"),
+Vectorscan supports the pattern syntax used by the PCRE library ("libpcre"),
 described at <http://www.pcre.org/>. However, not all constructs available in
 libpcre are supported. The use of unsupported constructs will result in
 compilation errors.

-The version of PCRE used to validate Hyperscan's interpretation of this syntax
+The version of PCRE used to validate Vectorscan's interpretation of this syntax
 is 8.41 or above.

 ====================
 Supported Constructs
 ====================

-The following regex constructs are supported by Hyperscan:
+The following regex constructs are supported by Vectorscan:

 * Literal characters and strings, with all libpcre quoting and character
  escapes.
@ -177,7 +177,7 @@ The following regex constructs are supported by Hyperscan:
      :c:member:`HS_FLAG_SINGLEMATCH` flag is on for that pattern.

  * Lazy modifiers (:regexp:`?` appended to another quantifier, e.g.
-    :regexp:`\\w+?`) are supported but ignored (as Hyperscan reports all
+    :regexp:`\\w+?`) are supported but ignored (as Vectorscan reports all
    matches).

 * Parenthesization, including the named and unnamed capturing and
@ -219,15 +219,15 @@ The following regex constructs are supported by Hyperscan:
 .. note:: At this time, not all patterns can be successfully compiled with the
  :c:member:`HS_FLAG_SOM_LEFTMOST` flag, which enables per-pattern support for
  :ref:`som`. The patterns that support this flag are a subset of patterns that
-  can be successfully compiled with Hyperscan; notably, many bounded repeat
-  forms that can be compiled with Hyperscan without the Start of Match flag
+  can be successfully compiled with Vectorscan; notably, many bounded repeat
+  forms that can be compiled with Vectorscan without the Start of Match flag
  enabled cannot be compiled with the flag enabled.

 ======================
 Unsupported Constructs
 ======================

-The following regex constructs are not supported by Hyperscan:
+The following regex constructs are not supported by Vectorscan:

 * Backreferences and capturing sub-expressions.
 * Arbitrary zero-width assertions.
@ -246,32 +246,32 @@ The following regex constructs are not supported by Hyperscan:
 Semantics
 *********

-While Hyperscan follows libpcre syntax, it provides different semantics. The
+While Vectorscan follows libpcre syntax, it provides different semantics. The
 major departures from libpcre semantics are motivated by the requirements of
 streaming and multiple simultaneous pattern matching.

 The major departures from libpcre semantics are:

-#. **Multiple pattern matching**: Hyperscan allows matches to be reported for
+#. **Multiple pattern matching**: Vectorscan allows matches to be reported for
   several patterns simultaneously. This is not equivalent to separating the
   patterns by :regexp:`|` in libpcre, which evaluates alternations
   left-to-right.

-#. **Lack of ordering**: the multiple matches that Hyperscan produces are not
+#. **Lack of ordering**: the multiple matches that Vectorscan produces are not
   guaranteed to be ordered, although they will always fall within the bounds of
   the current scan.

-#. **End offsets only**: Hyperscan's default behaviour is only to report the end
+#. **End offsets only**: Vectorscan's default behaviour is only to report the end
   offset of a match. Reporting of the start offset can be enabled with
   per-expression flags at pattern compile time. See :ref:`som` for details.

 #. **"All matches" reported**: scanning :regexp:`/foo.*bar/` against
-   ``fooxyzbarbar`` will return two matches from Hyperscan -- at the points
+   ``fooxyzbarbar`` will return two matches from Vectorscan -- at the points
   corresponding to the ends of ``fooxyzbar`` and ``fooxyzbarbar``. In contrast,
   libpcre semantics by default would report only one match at ``fooxyzbarbar``
   (greedy semantics) or, if non-greedy semantics were switched on, one match at
   ``fooxyzbar``. This means that switching between greedy and non-greedy
-   semantics is a no-op in Hyperscan.
+   semantics is a no-op in Vectorscan.

 To support libpcre quantifier semantics while accurately reporting streaming
 matches at the time they occur is impossible. For example, consider the pattern
@ -299,7 +299,7 @@ as in block 3 -- which would constitute a better match for the pattern.
 Start of Match
 ==============

-In standard operation, Hyperscan will only provide the end offset of a match
+In standard operation, Vectorscan will only provide the end offset of a match
 when the match callback is called. If the :c:member:`HS_FLAG_SOM_LEFTMOST` flag
 is specified for a particular pattern, then the same set of matches is
 returned, but each match will also provide the leftmost possible start offset
@ -308,7 +308,7 @@ corresponding to its end offset.
 Using the SOM flag entails a number of trade-offs and limitations:

 * Reduced pattern support: For many patterns, tracking SOM is complex and can
-  result in Hyperscan failing to compile a pattern with a "Pattern too
+  result in Vectorscan failing to compile a pattern with a "Pattern too
  large" error, even if the pattern is supported in normal operation.
 * Increased stream state: At scan time, state space is required to track
  potential SOM offsets, and this must be stored in persistent stream state in
@ -316,20 +316,20 @@ Using the SOM flag entails a number of trade-offs and limitations:
  required to match a pattern.
 * Performance overhead: Similarly, there is generally a performance cost
  associated with tracking SOM.
-* Incompatible features: Some other Hyperscan pattern flags (such as
+* Incompatible features: Some other Vectorscan pattern flags (such as
  :c:member:`HS_FLAG_SINGLEMATCH` and :c:member:`HS_FLAG_PREFILTER`) can not be
  used in combination with SOM. Specifying them together with
  :c:member:`HS_FLAG_SOM_LEFTMOST` will result in a compilation error.

 In streaming mode, the amount of precision delivered by SOM can be controlled
-with the SOM horizon flags. These instruct Hyperscan to deliver accurate SOM
+with the SOM horizon flags. These instruct Vectorscan to deliver accurate SOM
 information within a certain distance of the end offset, and return a special
 start offset of :c:member:`HS_OFFSET_PAST_HORIZON` otherwise. Specifying a
 small or medium SOM horizon will usually reduce the stream state required for a
 given database.

 .. note:: In streaming mode, the start offset returned for a match may refer to
-   a point in the stream *before* the current block being scanned. Hyperscan
+   a point in the stream *before* the current block being scanned. Vectorscan
   provides no facility for accessing earlier blocks; if the calling application
   needs to inspect historical data, then it must store it itself.

@ -341,7 +341,7 @@ Extended Parameters

 In some circumstances, more control over the matching behaviour of a pattern is
 required than can be specified easily using regular expression syntax. For
-these scenarios, Hyperscan provides the :c:func:`hs_compile_ext_multi` function
+these scenarios, Vectorscan provides the :c:func:`hs_compile_ext_multi` function
 that allows a set of "extended parameters" to be set on a per-pattern basis.

 Extended parameters are specified using an :c:type:`hs_expr_ext_t` structure,
@ -383,18 +383,18 @@ section.
 Prefiltering Mode
 =================

-Hyperscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
-be used to implement a prefilter for a pattern than Hyperscan would not
+Vectorscan provides a per-pattern flag, :c:member:`HS_FLAG_PREFILTER`, which can
+be used to implement a prefilter for a pattern that Vectorscan would not
 ordinarily support.

-This flag instructs Hyperscan to compile an "approximate" version of this
-pattern for use in a prefiltering application, even if Hyperscan does not
+This flag instructs Vectorscan to compile an "approximate" version of this
+pattern for use in a prefiltering application, even if Vectorscan does not
 support the pattern in normal operation.

 The set of matches returned when this flag is used is guaranteed to be a
 superset of the matches specified by the non-prefiltering expression.

-If the pattern contains pattern constructs not supported by Hyperscan (such as
+If the pattern contains pattern constructs not supported by Vectorscan (such as
 zero-width assertions, back-references or conditional references) these
 constructs will be replaced internally with broader constructs that may match
 more often.
@ -404,7 +404,7 @@ back-reference :regexp:`\\1`. In prefiltering mode, this pattern might be
 approximated by having its back-reference replaced with its referent, forming
 :regexp:`/\\w+ again \\w+/`.

-Furthermore, in prefiltering mode Hyperscan may simplify a pattern that would
+Furthermore, in prefiltering mode Vectorscan may simplify a pattern that would
 otherwise return a "Pattern too large" error at compile time, or for performance
 reasons (subject to the matching guarantee above).

@ -422,22 +422,22 @@ matches for the pattern.
 Instruction Set Specialization
 ******************************

-Hyperscan is able to make use of several modern instruction set features found
+Vectorscan is able to make use of several modern instruction set features found
 on x86 processors to provide improvements in scanning performance.

 Some of these features are selected when the library is built; for example,
-Hyperscan will use the native ``POPCNT`` instruction on processors where it is
+Vectorscan will use the native ``POPCNT`` instruction on processors where it is
 available and the library has been optimized for the host architecture.

-.. note:: By default, the Hyperscan runtime is built with the ``-march=native``
+.. note:: By default, the Vectorscan runtime is built with the ``-march=native``
   compiler flag and (where possible) will make use of all instructions known by
   the host's C compiler.

-To use some instruction set features, however, Hyperscan must build a
+To use some instruction set features, however, Vectorscan must build a
 specialized database to support them. This means that the target platform must
 be specified at pattern compile time.

-The Hyperscan compiler API functions all accept an optional
+The Vectorscan compiler API functions all accept an optional
 :c:type:`hs_platform_info_t` argument, which describes the target platform
 for the database to be built. If this argument is NULL, the database will be
 targeted at the current host platform.
@ -467,7 +467,7 @@ See :ref:`api_constants` for the full list of CPU tuning and feature flags.
 Approximate matching
 ********************

-Hyperscan provides an experimental approximate matching mode, which will match
+Vectorscan provides an experimental approximate matching mode, which will match
 patterns within a given edit distance. The exact matching behavior is defined as
 follows:

@ -492,7 +492,7 @@ follows:

 Here are a few examples of approximate matching:

-* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
+* Pattern :regexp:`/foo/` can match ``foo`` when using regular Vectorscan
  matching behavior. With approximate matching within edit distance 2, the
  pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
  ``f``, and anything else that lies within edit distance 2 of matching corpora
@ -513,7 +513,7 @@ matching support. Here they are, in a nutshell:
 * Reduced pattern support:

  * For many patterns, approximate matching is complex and can result in
-    Hyperscan failing to compile a pattern with a "Pattern too large" error,
+    Vectorscan failing to compile a pattern with a "Pattern too large" error,
    even if the pattern is supported in normal operation.
  * Additionally, some patterns cannot be approximately matched because they
    reduce to so-called "vacuous" patterns (patterns that match everything). For
@ -548,7 +548,7 @@ Logical Combinations
 ********************

 For situations when a user requires behaviour that depends on the presence or
-absence of matches from groups of patterns, Hyperscan provides support for the
+absence of matches from groups of patterns, Vectorscan provides support for the
 logical combination of patterns in a given pattern set, with three operators:
 ``NOT``, ``AND`` and ``OR``.

@ -561,7 +561,7 @@ offset is *true* if the expression it refers to is *false* at this offset.
 For example, ``NOT 101`` means that expression 101 has not yet matched at this
 offset.

-A logical combination is passed to Hyperscan at compile time as an expression.
+A logical combination is passed to Vectorscan at compile time as an expression.
 This combination expression will raise matches at every offset where one of its
 sub-expressions matches and the logical value of the whole expression is *true*.

@ -603,7 +603,7 @@ In a logical combination expression:
 * Whitespace is ignored.

 To use a logical combination expression, it must be passed to one of the
-Hyperscan compile functions (:c:func:`hs_compile_multi`,
+Vectorscan compile functions (:c:func:`hs_compile_multi`,
 :c:func:`hs_compile_ext_multi`) along with the :c:member:`HS_FLAG_COMBINATION` flag,
 which identifies the pattern as a logical combination expression. The patterns
 referred to in the logical combination expression must be compiled together in
@ -613,7 +613,7 @@ When an expression has the :c:member:`HS_FLAG_COMBINATION` flag set, it ignores
 all other flags except the :c:member:`HS_FLAG_SINGLEMATCH` flag and the
 :c:member:`HS_FLAG_QUIET` flag.

-Hyperscan will accept logical combination expressions at compile time that
+Vectorscan will accept logical combination expressions at compile time that
 evaluate to *true* when no patterns have matched, and report the match for
 combination at end of data if no patterns have matched; for example: ::

--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Hyperscan documentation build configuration file, created by
+# Vectorscan documentation build configuration file, created by
 # sphinx-quickstart on Tue Sep 29 15:59:19 2015.
 #
 # This file is execfile()d with the current directory set to its
@ -43,8 +43,8 @@ source_suffix = '.rst'
 master_doc = 'index'

 # General information about the project.
-project = u'Hyperscan'
-copyright = u'2015-2018, Intel Corporation'
+project = u'Vectorscan'
+copyright = u'2015-2020, Intel Corporation; 2020-2024, VectorCamp; and other contributors'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@ -202,7 +202,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  ('index', 'Hyperscan.tex', u'Hyperscan Documentation',
+  ('index', 'Hyperscan.tex', u'Vectorscan Documentation',
   u'Intel Corporation', 'manual'),
 ]

@ -232,8 +232,8 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'hyperscan', u'Hyperscan Documentation',
-     [u'Intel Corporation'], 1)
+    ('index', 'vectorscan', u'Vectorscan Documentation',
+     [u'Intel Corporation'], 7)
 ]

 # If true, show URL addresses after external links.
@ -246,8 +246,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'Hyperscan', u'Hyperscan Documentation',
-   u'Intel Corporation', 'Hyperscan', 'High-performance regular expression matcher.',
+  ('index', 'Vectorscan', u'Vectorscan Documentation',
+   u'Intel Corporation; VectorCamp', 'Vectorscan', 'High-performance regular expression matcher.',
   'Miscellaneous'),
 ]

--- a/doc/dev-reference/getting_started.rst
+++ b/doc/dev-reference/getting_started.rst
@ -7,43 +7,41 @@ Getting Started
 Very Quick Start
 ****************

-#. Clone Hyperscan ::
+#. Clone Vectorscan ::

-     cd <where-you-want-hyperscan-source>
-     git clone git://github.com/intel/hyperscan
+     cd <where-you-want-vectorscan-source>
+     git clone https://github.com/VectorCamp/vectorscan

-#. Configure Hyperscan
+#. Configure Vectorscan

   Ensure that you have the correct :ref:`dependencies <software>` present,
   and then:

   ::

-     cd <where-you-want-to-build-hyperscan>
+     cd <where-you-want-to-build-vectorscan>
     mkdir <build-dir>
     cd <build-dir>
-     cmake [-G <generator>] [options] <hyperscan-source-path>
+     cmake [-G <generator>] [options] <vectorscan-source-path>

   Known working generators:
      * ``Unix Makefiles`` --- make-compatible makefiles (default on Linux/FreeBSD/Mac OS X)
      * ``Ninja`` --- `Ninja <http://martine.github.io/ninja/>`_ build files.
-      * ``Visual Studio 15 2017`` --- Visual Studio projects

-   Generators that might work include:
+   Unsupported generators that might work include:
      * ``Xcode`` --- OS X Xcode projects.

-#. Build Hyperscan
+#. Build Vectorscan

   Depending on the generator used:
     * ``cmake --build .`` --- will build everything
     * ``make -j<jobs>`` --- use makefiles in parallel
     * ``ninja`` --- use Ninja build
-     * ``MsBuild.exe`` --- use Visual Studio MsBuild
     * etc.

-#. Check Hyperscan
+#. Check Vectorscan

-   Run the Hyperscan unit tests: ::
+   Run the Vectorscan unit tests: ::

     bin/unit-hyperscan

@ -55,20 +53,23 @@ Requirements
 Hardware
 ========

-Hyperscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
-32-bit (IA-32 Architecture) modes.
+Vectorscan will run on x86 processors in 64-bit (Intel\ |reg| 64 Architecture) and
+32-bit (IA-32 Architecture) modes as well as Arm v8.0+ aarch64, and POWER 8+ ppc64le
+machines.

 Hyperscan is a high performance software library that takes advantage of recent
-Intel architecture advances. At a minimum, support for Supplemental Streaming
-SIMD Extensions 3 (SSSE3) is required, which should be available on any modern
-x86 processor.
+architecture advances.

-Additionally, Hyperscan can make use of:
+Additionally, Vectorscan can make use of:

    * Intel Streaming SIMD Extensions 4.2 (SSE4.2)
    * the POPCNT instruction
    * Bit Manipulation Instructions (BMI, BMI2)
    * Intel Advanced Vector Extensions 2 (Intel AVX2)
+    * Arm NEON
+    * Arm SVE and SVE2
+    * Arm SVE2 BITPERM
+    * IBM Power8/Power9 VSX

 if present.

@ -79,40 +80,34 @@ These can be determined at library compile time, see :ref:`target_arch`.
 Software
 ========

-As a software library, Hyperscan doesn't impose any particular runtime
-software requirements, however to build the Hyperscan library we require a
-modern C and C++ compiler -- in particular, Hyperscan requires C99 and C++11
+As a software library, Vectorscan doesn't impose any particular runtime
+software requirements, however to build the Vectorscan library we require a
+modern C and C++ compiler -- in particular, Vectorscan requires C99 and C++17
 compiler support. The supported compilers are:

-    * GCC, v4.8.1 or higher
-    * Clang, v3.4 or higher (with libstdc++ or libc++)
-    * Intel C++ Compiler v15 or higher
-    * Visual C++ 2017 Build Tools
+    * GCC, v9 or higher
+    * Clang, v5 or higher (with libstdc++ or libc++)

-Examples of operating systems that Hyperscan is known to work on include:
+Examples of operating systems that Vectorscan is known to work on include:

 Linux:

-* Ubuntu 14.04 LTS or newer
+* Ubuntu 20.04 LTS or newer
 * RedHat/CentOS 7 or newer
+* Fedora 38 or newer
+* Debian 10

 FreeBSD:

 * 10.0 or newer

-Windows:
-
-* 8 or newer
-
 Mac OS X:

 * 10.8 or newer, using XCode/Clang

-Hyperscan *may* compile and run on other platforms, but there is no guarantee.
-We currently have experimental support for Windows using Intel C++ Compiler
-or Visual Studio 2017.
+Vectorscan *may* compile and run on other platforms, but there is no guarantee.

-In addition, the following software is required for compiling the Hyperscan library:
+In addition, the following software is required for compiling the Vectorscan library:

 ======================================================= =========== ======================================
 Dependency                                              Version     Notes
@ -132,20 +127,20 @@ Ragel, you may use Cygwin to build it from source.
 Boost Headers
 -------------

-Compiling Hyperscan depends on a recent version of the Boost C++ header
+Compiling Vectorscan depends on a recent version of the Boost C++ header
 library. If the Boost libraries are installed on the build machine in the
 usual paths, CMake will find them. If the Boost libraries are not installed,
 the location of the Boost source tree can be specified during the CMake
 configuration step using the ``BOOST_ROOT`` variable (described below).

 Another alternative is to put a copy of (or a symlink to) the boost
-subdirectory in ``<hyperscan-source-path>/include/boost``.
+subdirectory in ``<vectorscan-source-path>/include/boost``.

 For example: for the Boost-1.59.0 release: ::

-    ln -s boost_1_59_0/boost <hyperscan-source-path>/include/boost
+    ln -s boost_1_59_0/boost <vectorscan-source-path>/include/boost

-As Hyperscan uses the header-only parts of Boost, it is not necessary to
+As Vectorscan uses the header-only parts of Boost, it is not necessary to
 compile the Boost libraries.

 CMake Configuration
@ -168,11 +163,12 @@ Common options for CMake include:
 |                        | Valid options are Debug, Release, RelWithDebInfo,  |
 |                        | and MinSizeRel. Default is RelWithDebInfo.         |
 +------------------------+----------------------------------------------------+
-| BUILD_SHARED_LIBS      | Build Hyperscan as a shared library instead of     |
+| BUILD_SHARED_LIBS      | Build Vectorscan as a shared library instead of    |
 |                        | the default static library.                        |
+|                        | Default: Off                                       |
 +------------------------+----------------------------------------------------+
-| BUILD_STATIC_AND_SHARED| Build both static and shared Hyperscan libs.       |
-|                        | Default off.                                       |
+| BUILD_STATIC_LIBS      | Build Vectorscan as a static library.              |
+|                        | Default: On                                        |
 +------------------------+----------------------------------------------------+
 | BOOST_ROOT             | Location of Boost source tree.                     |
 +------------------------+----------------------------------------------------+
@ -180,12 +176,64 @@ Common options for CMake include:
 +------------------------+----------------------------------------------------+
 | FAT_RUNTIME            | Build the :ref:`fat runtime<fat_runtime>`. Default |
 |                        | true on Linux, not available elsewhere.            |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| USE_CPU_NATIVE         | Native CPU detection is off by default, however it |
+|                        | is possible to build a performance-oriented non-fat|
+|                        | library tuned to your CPU.                         |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| SANITIZE               | Use libasan sanitizer to detect possible bugs.     |
+|                        | Valid options are address, memory and undefined.   |
++------------------------+----------------------------------------------------+
+| SIMDE_BACKEND          | Enable SIMDe backend. If this is chosen all native |
+|                        | (SSE/AVX/AVX512/Neon/SVE/VSX) backends will be     |
+|                        | disabled and a SIMDe SSE4.2 emulation backend will |
+|                        | be enabled. This will enable Vectorscan to build   |
+|                        | and run on architectures without SIMD.             |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+| SIMDE_NATIVE           | Enable SIMDe native emulation of x86 SSE4.2        |
+|                        | intrinsics on the building platform. That is,      |
+|                        | SSE4.2 intrinsics will be emulated using Neon on   |
+|                        | an Arm platform, or VSX on a Power platform, etc.  |
+|                        | Default: Off                                       |
++------------------------+----------------------------------------------------+
+
+X86 platform specific options include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| BUILD_AVX2             | Enable code for AVX2.                              |
++------------------------+----------------------------------------------------+
+| BUILD_AVX512           | Enable code for AVX512. Implies BUILD_AVX2.        |
++------------------------+----------------------------------------------------+
+| BUILD_AVX512VBMI       | Enable code for AVX512 with VBMI extension. Implies|
+|                        | BUILD_AVX512.                                      |
++------------------------+----------------------------------------------------+
+
+Arm platform specific options include:
+
++------------------------+----------------------------------------------------+
+| Variable               | Description                                        |
++========================+====================================================+
+| BUILD_SVE              | Enable code for SVE, like on AWS Graviton3 CPUs.   |
+|                        | Not much code is ported just for SVE, but enabling |
+|                        | SVE code production does improve code generation,  |
+|                        | see Benchmarks.                                    |
++------------------------+----------------------------------------------------+
+| BUILD_SVE2             | Enable code for SVE2, implies BUILD_SVE. Most      |
+|                        | non-Neon code is written for SVE2.                 |
++------------------------+----------------------------------------------------+
+| BUILD_SVE2_BITPERM     | Enable code for SVE2_BITPERM hardware feature,     |
+|                        | implies BUILD_SVE2.                                |
 +------------------------+----------------------------------------------------+

 For example, to generate a ``Debug`` build: ::

    cd <build-dir>
-    cmake -DCMAKE_BUILD_TYPE=Debug <hyperscan-source-path>
+    cmake -DCMAKE_BUILD_TYPE=Debug <vectorscan-source-path>



@ -193,7 +241,7 @@ Build Type
 ----------

 CMake determines a number of features for a build based on the Build Type.
-Hyperscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
+Vectorscan defaults to ``RelWithDebInfo``, i.e. "release with debugging
 information". This is a performance optimized build without runtime assertions
 but with debug symbols enabled.

@ -201,7 +249,7 @@ The other types of builds are:

 * ``Release``: as above, but without debug symbols
 * ``MinSizeRel``: a stripped release build
- * ``Debug``: used when developing Hyperscan. Includes runtime assertions
+ * ``Debug``: used when developing Vectorscan. Includes runtime assertions
   (which has a large impact on runtime performance), and will also enable
   some other build features like building internal unit
   tests.
@ -211,7 +259,7 @@ The other types of builds are:
 Target Architecture
 -------------------

-Unless using the :ref:`fat runtime<fat_runtime>`, by default Hyperscan will be
+Unless using the :ref:`fat runtime<fat_runtime>`, by default Vectorscan will be
 compiled to target the instruction set of the processor of the machine that
 is being used for compilation. This is done via the use of ``-march=native``. The
 result of this means that a library built on one machine may not work on a
@ -223,7 +271,7 @@ CMake, or ``CMAKE_C_FLAGS`` and ``CMAKE_CXX_FLAGS`` on the CMake command line. F
 example, to set the instruction subsets up to ``SSE4.2`` using GCC 4.8: ::

    cmake -DCMAKE_C_FLAGS="-march=corei7" \
-      -DCMAKE_CXX_FLAGS="-march=corei7" <hyperscan-source-path>
+      -DCMAKE_CXX_FLAGS="-march=corei7" <vectorscan-source-path>

 For more information, refer to :ref:`instr_specialization`.

@ -232,17 +280,17 @@ For more information, refer to :ref:`instr_specialization`.
 Fat Runtime
 -----------

-A feature introduced in Hyperscan v4.4 is the ability for the Hyperscan
+A feature introduced in Hyperscan v4.4 is the ability for the Vectorscan
 library to dispatch the most appropriate runtime code for the host processor.
-This feature is called the "fat runtime", as a single Hyperscan library
+This feature is called the "fat runtime", as a single Vectorscan library
 contains multiple copies of the runtime code for different instruction sets.

 .. note::

    The fat runtime feature is only available on Linux. Release builds of
-    Hyperscan will default to having the fat runtime enabled where supported.
+    Vectorscan will default to having the fat runtime enabled where supported.

-When building the library with the fat runtime, the Hyperscan runtime code
+When building the library with the fat runtime, the Vectorscan runtime code
 will be compiled multiple times for these different instruction sets, and
 these compiled objects are combined into one library. There are no changes to
 how user applications are built against this library.
@ -254,11 +302,11 @@ resolved so that the right version of each API function is used. There is no
 impact on function call performance, as this check and resolution is performed
 by the ELF loader once when the binary is loaded.

-If the Hyperscan library is used on x86 systems without ``SSSE3``, the runtime
+If the Vectorscan library is used on x86 systems without ``SSE4.2``, the runtime
 API functions will resolve to functions that return :c:member:`HS_ARCH_ERROR`
 instead of potentially executing illegal instructions. The API function
 :c:func:`hs_valid_platform` can be used by application writers to determine if
-the current platform is supported by Hyperscan.
+the current platform is supported by Vectorscan.

 As of this release, the variants of the runtime that are built, and the CPU
 capability that is required, are the following:
@ -299,6 +347,11 @@ capability that is required, are the following:

        cmake -DBUILD_AVX512VBMI=on <...>

+    Vectorscan adds support for Arm processors and SVE, SVE2 and SVE2_BITPERM.
+    For example: ::
+
+        cmake -DBUILD_SVE=ON -DBUILD_SVE2=ON -DBUILD_SVE2_BITPERM=ON <...>
+
 As the fat runtime requires compiler, libc, and binutils support, at this time
 it will only be enabled for Linux builds where the compiler supports the
 `indirect function "ifunc" function attribute
--- a/doc/dev-reference/index.rst
+++ b/doc/dev-reference/index.rst
@ -1,5 +1,5 @@
 ###############################################
-Hyperscan |version| Developer's Reference Guide
+Vectorscan |version| Developer's Reference Guide
 ###############################################

 -------
--- a/doc/dev-reference/intro.rst
+++ b/doc/dev-reference/intro.rst
@ -5,11 +5,11 @@
 Introduction
 ############

-Hyperscan is a software regular expression matching engine designed with
+Vectorscan is a software regular expression matching engine designed with
 high performance and flexibility in mind. It is implemented as a library that
 exposes a straightforward C API.

-The Hyperscan API itself is composed of two major components:
+The Vectorscan API itself is composed of two major components:

 ***********
 Compilation
@ -17,7 +17,7 @@ Compilation

 These functions take a group of regular expressions, along with identifiers and
 option flags, and compile them into an immutable database that can be used by
-the Hyperscan scanning API. This compilation process performs considerable
+the Vectorscan scanning API. This compilation process performs considerable
 analysis and optimization work in order to build a database that will match the
 given expressions efficiently.

@ -36,8 +36,8 @@ See :ref:`compilation` for more detail.
 Scanning
 ********

-Once a Hyperscan database has been created, it can be used to scan data in
-memory. Hyperscan provides several scanning modes, depending on whether the
+Once a Vectorscan database has been created, it can be used to scan data in
+memory. Vectorscan provides several scanning modes, depending on whether the
 data to be scanned is available as a single contiguous block, whether it is
 distributed amongst several blocks in memory at the same time, or whether it is
 to be scanned as a sequence of blocks in a stream.
@ -45,7 +45,7 @@ to be scanned as a sequence of blocks in a stream.
 Matches are delivered to the application via a user-supplied callback function
 that is called synchronously for each match.

-For a given database, Hyperscan provides several guarantees:
+For a given database, Vectorscan provides several guarantees:

 * No memory allocations occur at runtime with the exception of two
  fixed-size allocations, both of which should be done ahead of time for
@ -56,7 +56,7 @@ For a given database, Hyperscan provides several guarantees:
    call.
  - **Stream state**: in streaming mode only, some state space is required to
    store data that persists between scan calls for each stream. This allows
-    Hyperscan to track matches that span multiple blocks of data.
+    Vectorscan to track matches that span multiple blocks of data.

 * The sizes of the scratch space and stream state (in streaming mode) required
  for a given database are fixed and determined at database compile time. This
@ -64,7 +64,7 @@ For a given database, Hyperscan provides several guarantees:
  time, and these structures can be pre-allocated if required for performance
  reasons.

-* Any pattern that has successfully been compiled by the Hyperscan compiler can
+* Any pattern that has successfully been compiled by the Vectorscan compiler can
  be scanned against any input. There are no internal resource limits or other
  limitations at runtime that could cause a scan call to return an error.

@ -74,12 +74,12 @@ See :ref:`runtime` for more detail.
 Tools
 *****

-Some utilities for testing and benchmarking Hyperscan are included with the
+Some utilities for testing and benchmarking Vectorscan are included with the
 library. See :ref:`tools` for more information.

 ************
 Example Code
 ************

-Some simple example code demonstrating the use of the Hyperscan API is
-available in the ``examples/`` subdirectory of the Hyperscan distribution.
+Some simple example code demonstrating the use of the Vectorscan API is
+available in the ``examples/`` subdirectory of the Vectorscan distribution.
--- a/doc/dev-reference/performance.rst
+++ b/doc/dev-reference/performance.rst
@ -4,7 +4,7 @@
 Performance Considerations
 ##########################

-Hyperscan supports a wide range of patterns in all three scanning modes. It is
+Vectorscan supports a wide range of patterns in all three scanning modes. It is
 capable of extremely high levels of performance, but certain patterns can
 reduce performance markedly.

@ -25,7 +25,7 @@ For example, caseless matching of :regexp:`/abc/` can be written as:
 * :regexp:`/(?i)abc(?-i)/`
 * :regexp:`/abc/i`

-Hyperscan is capable of handling all these constructs. Unless there is a
+Vectorscan is capable of handling all these constructs. Unless there is a
 specific reason otherwise, do not rewrite patterns from one form to another.

 As another example, matching of :regexp:`/foo(bar|baz)(frotz)?/` can be
@ -41,24 +41,24 @@ Library usage

 .. tip:: Do not hand-optimize library usage.

-The Hyperscan library is capable of dealing with small writes, unusually large
+The Vectorscan library is capable of dealing with small writes, unusually large
 and small pattern sets, etc. Unless there is a specific performance problem
-with some usage of the library, it is best to use Hyperscan in a simple and
+with some usage of the library, it is best to use Vectorscan in a simple and
 direct fashion. For example, it is unlikely for there to be much benefit in
 buffering input to the library into larger blocks unless streaming writes are
 tiny (say, 1-2 bytes at a time).

-Unlike many other pattern matching products, Hyperscan will run faster with
+Unlike many other pattern matching products, Vectorscan will run faster with
 small numbers of patterns and slower with large numbers of patterns in a smooth
 fashion (as opposed to, typically, running at a moderate speed up to some fixed
 limit then either breaking or running half as fast).

-Hyperscan also provides high-throughput matching with a single thread of
-control per core; if a database runs at 3.0 Gbps in Hyperscan it means that a
+Vectorscan also provides high-throughput matching with a single thread of
+control per core; if a database runs at 3.0 Gbps in Vectorscan it means that a
 3000-bit block of data will be scanned in 1 microsecond in a single thread of
 control, not that it is required to scan 22 3000-bit blocks of data in 22
 microseconds. Thus, it is not usually necessary to buffer data to supply
-Hyperscan with available parallelism.
+Vectorscan with available parallelism.

 ********************
 Block-based matching
@ -72,7 +72,7 @@ accumulated before processing, it should be scanned in block rather than in
 streaming mode.

 Unnecessary use of streaming mode reduces the number of optimizations that can
-be applied in Hyperscan and may make some patterns run slower.
+be applied in Vectorscan and may make some patterns run slower.

 If there is a mixture of 'block' and 'streaming' mode patterns, these should be
 scanned in separate databases except in the case that the streaming patterns
@ -107,7 +107,7 @@ Allocate scratch ahead of time

 Scratch allocation is not necessarily a cheap operation. Since it is the first
 time (after compilation or deserialization) that a pattern database is used,
-Hyperscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
+Vectorscan performs some validation checks inside :c:func:`hs_alloc_scratch` and
 must also allocate memory.

 Therefore, it is important to ensure that :c:func:`hs_alloc_scratch` is not
@ -329,7 +329,7 @@ Consequently, :regexp:`/foo.*bar/L` with a check on start of match values after
 the callback is considerably more expensive and general than
 :regexp:`/foo.{300}bar/`.

-Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be
+Similarly, the :cpp:member:`hs_expr_ext::min_length` extended parameter can be
 used to specify a lower bound on the length of the matches for a pattern. Using
 this facility may be more lightweight in some circumstances than using the SOM
 flag and post-confirming match length in the calling application.
--- a/doc/dev-reference/preface.rst
+++ b/doc/dev-reference/preface.rst
@ -6,35 +6,35 @@ Preface
 Overview
 ********

-Hyperscan is a regular expression engine designed to offer high performance, the
+Vectorscan is a regular expression engine designed to offer high performance, the
 ability to match multiple expressions simultaneously and flexibility in
 scanning operation.

 Patterns are provided to a compilation interface which generates an immutable
 pattern database. The scan interface then can be used to scan a target data
 buffer for the given patterns, returning any matching results from that data
-buffer. Hyperscan also provides a streaming mode, in which matches that span
+buffer. Vectorscan also provides a streaming mode, in which matches that span
 several blocks in a stream are detected.

-This document is designed to facilitate code-level integration of the Hyperscan
+This document is designed to facilitate code-level integration of the Vectorscan
 library with existing or new applications.

-:ref:`intro` is a short overview of the Hyperscan library, with more detail on
-the Hyperscan API provided in the subsequent sections: :ref:`compilation` and
+:ref:`intro` is a short overview of the Vectorscan library, with more detail on
+the Vectorscan API provided in the subsequent sections: :ref:`compilation` and
 :ref:`runtime`.

 :ref:`perf` provides details on various factors which may impact the
-performance of a Hyperscan integration.
+performance of a Vectorscan integration.

 :ref:`api_constants` and :ref:`api_files` provide a detailed summary of the
-Hyperscan Application Programming Interface (API).
+Vectorscan Application Programming Interface (API).

 ********
 Audience
 ********

-This guide is aimed at developers interested in integrating Hyperscan into an
-application. For information on building the Hyperscan library, see the Quick
+This guide is aimed at developers interested in integrating Vectorscan into an
+application. For information on building the Vectorscan library, see the Quick
 Start Guide.

 ***********
--- a/doc/dev-reference/runtime.rst
+++ b/doc/dev-reference/runtime.rst
@ -4,7 +4,7 @@
 Scanning for Patterns
 #####################

-Hyperscan provides three different scanning modes, each with its own scan
+Vectorscan provides three different scanning modes, each with its own scan
 function beginning with ``hs_scan``. In addition, streaming mode has a number
 of other API functions for managing stream state.

@ -33,8 +33,8 @@ See :c:type:`match_event_handler` for more information.
 Streaming Mode
 **************

-The core of the Hyperscan streaming runtime API consists of functions to open,
-scan, and close Hyperscan data streams:
+The core of the Vectorscan streaming runtime API consists of functions to open,
+scan, and close Vectorscan data streams:

 * :c:func:`hs_open_stream`: allocates and initializes a new stream for scanning.

@ -57,14 +57,14 @@ will return immediately with :c:member:`HS_SCAN_TERMINATED`. The caller must
 still call :c:func:`hs_close_stream` to complete the clean-up process for that
 stream.

-Streams exist in the Hyperscan library so that pattern matching state can be
+Streams exist in the Vectorscan library so that pattern matching state can be
 maintained across multiple blocks of target data -- without maintaining this
 state, it would not be possible to detect patterns that span these blocks of
 data. This, however, does come at the cost of requiring an amount of storage
 per-stream (the size of this storage is fixed at compile time), and a slight
 performance penalty in some cases to manage the state.

-While Hyperscan does always support a strict ordering of multiple matches,
+While Vectorscan does always support a strict ordering of multiple matches,
 streaming matches will not be delivered at offsets before the current stream
 write, with the exception of zero-width asserts, where constructs such as
 :regexp:`\\b` and :regexp:`$` can cause a match on the final character of a
@ -76,7 +76,7 @@ Stream Management
 =================

 In addition to :c:func:`hs_open_stream`, :c:func:`hs_scan_stream`, and
-:c:func:`hs_close_stream`, the Hyperscan API provides a number of other
+:c:func:`hs_close_stream`, the Vectorscan API provides a number of other
 functions for the management of streams:

 * :c:func:`hs_reset_stream`: resets a stream to its initial state; this is
@ -98,10 +98,10 @@ A stream object is allocated as a fixed size region of memory which has been
 sized to ensure that no memory allocations are required during scan
 operations. When the system is under memory pressure, it may be useful to reduce
 the memory consumed by streams that are not expected to be used soon. The
-Hyperscan API provides calls for translating a stream to and from a compressed
+Vectorscan API provides calls for translating a stream to and from a compressed
 representation for this purpose. The compressed representation differs from the
 full stream object as it does not reserve space for components which are not
-required given the current stream state. The Hyperscan API functions for this
+required given the current stream state. The Vectorscan API functions for this
 functionality are:

 * :c:func:`hs_compress_stream`: fills the provided buffer with a compressed
@ -157,7 +157,7 @@ scanned in block mode.
 Scratch Space
 *************

-While scanning data, Hyperscan needs a small amount of temporary memory to store
+While scanning data, Vectorscan needs a small amount of temporary memory to store
 on-the-fly internal data. This amount is unfortunately too large to fit on the
 stack, particularly for embedded applications, and allocating memory dynamically
 is too expensive, so a pre-allocated "scratch" space must be provided to the
@ -170,7 +170,7 @@ databases, only a single scratch region is necessary: in this case, calling
 will ensure that the scratch space is large enough to support scanning against
 any of the given databases.

-While the Hyperscan library is re-entrant, the use of scratch spaces is not.
+While the Vectorscan library is re-entrant, the use of scratch spaces is not.
 For example, if by design it is deemed necessary to run recursive or nested
 scanning (say, from the match callback function), then an additional scratch
 space is required for that context.
@ -219,11 +219,11 @@ For example:
 Custom Allocators
 *****************

-By default, structures used by Hyperscan at runtime (scratch space, stream
+By default, structures used by Vectorscan at runtime (scratch space, stream
 state, etc) are allocated with the default system allocators, usually
 ``malloc()`` and ``free()``.

-The Hyperscan API provides a facility for changing this behaviour to support
+The Vectorscan API provides a facility for changing this behaviour to support
 applications that use custom memory allocators.

 These functions are:
--- a/doc/dev-reference/serialization.rst
+++ b/doc/dev-reference/serialization.rst
@ -4,7 +4,7 @@
 Serialization
 #############

-For some applications, compiling Hyperscan pattern databases immediately prior
+For some applications, compiling Vectorscan pattern databases immediately prior
 to use is not an appropriate design. Some users may wish to:

 * Compile pattern databases on a different host;
@ -14,9 +14,9 @@ to use is not an appropriate design. Some users may wish to:

 * Control the region of memory in which the compiled database is located.

-Hyperscan pattern databases are not completely flat in memory: they contain
+Vectorscan pattern databases are not completely flat in memory: they contain
 pointers and have specific alignment requirements. Therefore, they cannot be
-copied (or otherwise relocated) directly. To enable these use cases, Hyperscan
+copied (or otherwise relocated) directly. To enable these use cases, Vectorscan
 provides functionality for serializing and deserializing compiled pattern
 databases.

@ -40,10 +40,10 @@ The API provides the following functions:
   returns a string containing information about the database. This call is
   analogous to :c:func:`hs_database_info`.

-.. note:: Hyperscan performs both version and platform compatibility checks
+.. note:: Vectorscan performs both version and platform compatibility checks
   upon deserialization. The :c:func:`hs_deserialize_database` and
   :c:func:`hs_deserialize_database_at` functions will only permit the
-   deserialization of databases compiled with (a) the same version of Hyperscan
+   deserialization of databases compiled with (a) the same version of Vectorscan
   and (b) platform features supported by the current host platform. See
   :ref:`instr_specialization` for more information on platform specialization.

@ -51,17 +51,17 @@ The API provides the following functions:
 The Runtime Library
 ===================

-The main Hyperscan library (``libhs``) contains both the compiler and runtime
-portions of the library. This means that in order to support the Hyperscan
+The main Vectorscan library (``libhs``) contains both the compiler and runtime
+portions of the library. This means that in order to support the Vectorscan
 compiler, which is written in C++, it requires C++ linkage and has a
 dependency on the C++ standard library.

 Many embedded applications require only the scanning ("runtime") portion of the
-Hyperscan library. In these cases, pattern compilation generally takes place on
+Vectorscan library. In these cases, pattern compilation generally takes place on
 another host, and serialized pattern databases are delivered to the application
 for use.

 To support these applications without requiring the C++ dependency, a
-runtime-only version of the Hyperscan library, called ``libhs_runtime``, is also
+runtime-only version of the Vectorscan library, called ``libhs_runtime``, is also
 distributed. This library does not depend on the C++ standard library and
-provides all Hyperscan functions other that those used to compile databases.
+provides all Vectorscan functions other than those used to compile databases.
--- a/doc/dev-reference/tools.rst
+++ b/doc/dev-reference/tools.rst
@ -4,14 +4,14 @@
 Tools
 #####

-This section describes the set of utilities included with the Hyperscan library.
+This section describes the set of utilities included with the Vectorscan library.

 ********************
 Quick Check: hscheck
 ********************

-The ``hscheck`` tool allows the user to quickly check whether Hyperscan supports
-a group of patterns. If a pattern is rejected by Hyperscan's compiler, the
+The ``hscheck`` tool allows the user to quickly check whether Vectorscan supports
+a group of patterns. If a pattern is rejected by Vectorscan's compiler, the
 compile error is provided on standard output.

 For example, given the following three patterns (the last of which contains a
@ -34,7 +34,7 @@ syntax error) in a file called ``/tmp/test``::
 Benchmarker: hsbench
 ********************

-The ``hsbench`` tool provides an easy way to measure Hyperscan's performance
+The ``hsbench`` tool provides an easy way to measure Vectorscan's performance
 for a particular set of patterns and corpus of data to be scanned.

 Patterns are supplied in the format described below in
@ -44,7 +44,7 @@ easy control of how a corpus is broken into blocks and streams.

 .. note:: A group of Python scripts for constructing corpora databases from
   various input types, such as PCAP network traffic captures or text files, can
-   be found in the Hyperscan source tree in ``tools/hsbench/scripts``.
+   be found in the Vectorscan source tree in ``tools/hsbench/scripts``.

 Running hsbench
 ===============
@ -56,7 +56,7 @@ produce output like this::
    $ hsbench -e /tmp/patterns -c /tmp/corpus.db

    Signatures:        /tmp/patterns
-    Hyperscan info:    Version: 4.3.1 Features:  AVX2 Mode: STREAM
+    Vectorscan info:   Version: 5.4.11 Features:  AVX2 Mode: STREAM
    Expression count:  200
    Bytecode size:     342,540 bytes
    Database CRC:      0x6cd6b67c
@ -77,7 +77,7 @@ takes to perform all twenty scans. The number of repeats can be changed with the
 ``-n`` argument, and the results of each scan will be displayed if the
 ``--per-scan`` argument is specified.

-To benchmark Hyperscan on more than one core, you can supply a list of cores
+To benchmark Vectorscan on more than one core, you can supply a list of cores
 with the ``-T`` argument, which will instruct ``hsbench`` to start one
 benchmark thread per core given and compute the throughput from the time taken
 to complete all of them.
@ -91,17 +91,17 @@ Correctness Testing: hscollider
 *******************************

 The ``hscollider`` tool, or Pattern Collider, provides a way to verify
-Hyperscan's matching behaviour. It does this by compiling and scanning patterns
+Vectorscan's matching behaviour. It does this by compiling and scanning patterns
 (either singly or in groups) against known corpora and comparing the results
 against another engine (the "ground truth"). Two sources of ground truth for
 comparison are available:

 * The PCRE library (http://pcre.org/).
- * An NFA simulation run on Hyperscan's compile-time graph representation. This
+ * An NFA simulation run on Vectorscan's compile-time graph representation. This
   is used if PCRE cannot support the pattern or if PCRE execution fails due to
   a resource limit.

-Much of Hyperscan's testing infrastructure is built on ``hscollider``, and the
+Much of Vectorscan's testing infrastructure is built on ``hscollider``, and the
 tool is designed to take advantage of multiple cores and provide considerable
 flexibility in controlling the test. These options are described in the help
 (``hscollider -h``) and include:
@ -116,11 +116,11 @@ flexibility in controlling the test. These options are described in the help
 Using hscollider to debug a pattern
 ===================================

-One common use-case for ``hscollider`` is to determine whether Hyperscan will
+One common use-case for ``hscollider`` is to determine whether Vectorscan will
 match a pattern in the expected location, and whether this accords with PCRE's
 behaviour for the same case.

-Here is an example. We put our pattern in a file in Hyperscan's pattern
+Here is an example. We put our pattern in a file in Vectorscan's pattern
 format::

    $ cat /tmp/pat
@ -172,7 +172,7 @@ individual matches are displayed in the output::

    Total elapsed time: 0.00522815 secs.

-We can see from this output that both PCRE and Hyperscan find matches ending at
+We can see from this output that both PCRE and Vectorscan find matches ending at
 offset 33 and 45, and so ``hscollider`` considers this test case to have
 passed.

@ -180,13 +180,13 @@ passed.
 corpus alignment 0, and ``-T 1`` instructs us to only use one thread.)

 .. note:: In default operation, PCRE produces only one match for a scan, unlike
-  Hyperscan's automata semantics. The ``hscollider`` tool uses libpcre's
-  "callout" functionality to match Hyperscan's semantics.
+  Vectorscan's automata semantics. The ``hscollider`` tool uses libpcre's
+  "callout" functionality to match Vectorscan's semantics.

 Running a larger scan test
 ==========================

-A set of patterns for testing purposes are distributed with Hyperscan, and these
+A set of patterns for testing purposes is distributed with Vectorscan, and these
 can be tested via ``hscollider`` on an in-tree build. Two CMake targets are
 provided to do this easily:

@ -202,10 +202,10 @@ Debugging: hsdump
 *****************

 When built in debug mode (using the CMake directive ``CMAKE_BUILD_TYPE`` set to
-``Debug``), Hyperscan includes support for dumping information about its
+``Debug``), Vectorscan includes support for dumping information about its
 internals during pattern compilation with the ``hsdump`` tool.

-This information is mostly of use to Hyperscan developers familiar with the
+This information is mostly of use to Vectorscan developers familiar with the
 library's internal structure, but can be used to diagnose issues with patterns
 and provide more information in bug reports.

@ -215,7 +215,7 @@ and provide more information in bug reports.
 Pattern Format
 **************

-All of the Hyperscan tools accept patterns in the same format, read from plain
+All of the Vectorscan tools accept patterns in the same format, read from plain
 text files with one pattern per line. Each line looks like this:

 * ``<integer id>:/<regex>/<flags>``
@ -227,12 +227,12 @@ For example::
    3:/^.{10,20}hatstand/m

 The integer ID is the value that will be reported when a match is found by
-Hyperscan and must be unique.
+Vectorscan and must be unique.

 The pattern itself is a regular expression in PCRE syntax; see
 :ref:`compilation` for more information on supported features.

-The flags are single characters that map to Hyperscan flags as follows:
+The flags are single characters that map to Vectorscan flags as follows:

 =========   =================================    ===========
 Character   API Flag                             Description
@ -256,7 +256,7 @@ between braces, separated by commas. For example::

    1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}

-All Hyperscan tools will accept a pattern file (or a directory containing
+All Vectorscan tools will accept a pattern file (or a directory containing
 pattern files) with the ``-e`` argument. If no further arguments constraining
 the pattern set are given, all patterns in those files are used.

--- a/examples/patbench.cc
+++ b/examples/patbench.cc
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -134,7 +135,12 @@
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/ip_icmp.h>
+#ifdef __NetBSD__
+#include <net/ethertypes.h>
+#include <net/if_ether.h>
+#else
 #include <net/ethernet.h>
+#endif /* __NetBSD__ */
 #include <arpa/inet.h>

 #include <pcap.h>
@ -196,15 +202,15 @@ struct FiveTuple {
    unsigned int dstPort;

    // Construct a FiveTuple from a TCP or UDP packet.
-    FiveTuple(const struct ip *iphdr) {
+    explicit FiveTuple(const struct ip *iphdr) {
        // IP fields
        protocol = iphdr->ip_p;
        srcAddr = iphdr->ip_src.s_addr;
        dstAddr = iphdr->ip_dst.s_addr;

        // UDP/TCP ports
-        const struct udphdr *uh = (const struct udphdr *)
-                (((const char *)iphdr) + (iphdr->ip_hl * 4));
+        const struct udphdr *uh = reinterpret_cast<const struct udphdr *>
+                ((reinterpret_cast<const char *>(iphdr)) + (iphdr->ip_hl * 4));
        srcPort = uh->uh_sport;
        dstPort = uh->uh_dport;
    }
@ -233,7 +239,7 @@ static
 int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
            unsigned int flags, void *ctx) {
    // Our context points to a size_t storing the match count
-    size_t *matches = (size_t *)ctx;
+    size_t *matches = static_cast<size_t *>(ctx);
    (*matches)++;
    return 0; // continue matching
 }
@ -295,7 +301,7 @@ public:
        // database.
        hs_error_t err = hs_alloc_scratch(db, &scratch);
        if (err != HS_SUCCESS) {
-            cerr << "ERROR: could not allocate scratch space. Exiting." << endl;
+            cerr << "ERROR: could not allocate scratch space. Exiting.\n";
            exit(-1);
        }
    }
@ -307,8 +313,7 @@ public:
        size_t scratch_size;
        hs_error_t err = hs_scratch_size(scratch, &scratch_size);
        if (err != HS_SUCCESS) {
-            cerr << "ERROR: could not query scratch space size. Exiting."
-                 << endl;
+            cerr << "ERROR: could not query scratch space size. Exiting.\n";
            exit(-1);
        }
        return scratch_size;
@ -334,9 +339,9 @@ public:
            }

            // Valid TCP or UDP packet
-            const struct ip *iphdr = (const struct ip *)(pktData
+            const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData
                    + sizeof(struct ether_header));
-            const char *payload = (const char *)pktData + offset;
+            const char *payload = reinterpret_cast<const char *>(pktData) + offset;

            size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
                                          stream_map.size())).first->second;
@ -352,9 +357,8 @@ public:
    // Return the number of bytes scanned
    size_t bytes() const {
        size_t sum = 0;
-        for (const auto &packet : packets) {
-            sum += packet.size();
-        }
+        auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
+        sum += std::accumulate(packets.begin(), packets.end(), size_t{0}, packs); // int seed would truncate
        return sum;
    }

@ -374,7 +378,7 @@ public:
        for (auto &stream : streams) {
            hs_error_t err = hs_open_stream(db, 0, &stream);
            if (err != HS_SUCCESS) {
-                cerr << "ERROR: Unable to open stream. Exiting." << endl;
+                cerr << "ERROR: Unable to open stream. Exiting.\n";
                exit(-1);
            }
        }
@ -383,11 +387,11 @@ public:
    // Close all open Hyperscan streams (potentially generating any
    // end-anchored matches)
    void closeStreams() {
-        for (auto &stream : streams) {
+        for (const auto &stream : streams) {
            hs_error_t err =
                hs_close_stream(stream, scratch, onMatch, &matchCount);
            if (err != HS_SUCCESS) {
-                cerr << "ERROR: Unable to close stream. Exiting." << endl;
+                cerr << "ERROR: Unable to close stream. Exiting.\n";
                exit(-1);
            }
        }
@ -402,7 +406,7 @@ public:
                                            pkt.c_str(), pkt.length(), 0,
                                            scratch, onMatch, &matchCount);
            if (err != HS_SUCCESS) {
-                cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+                cerr << "ERROR: Unable to scan packet. Exiting.\n";
                exit(-1);
            }
        }
@ -416,7 +420,7 @@ public:
            hs_error_t err = hs_scan(db, pkt.c_str(), pkt.length(), 0,
                                     scratch, onMatch, &matchCount);
            if (err != HS_SUCCESS) {
-                cerr << "ERROR: Unable to scan packet. Exiting." << endl;
+                cerr << "ERROR: Unable to scan packet. Exiting.\n";
                exit(-1);
            }
        }
@ -436,7 +440,7 @@ class Sigdata {

 public:
    Sigdata() {}
-    Sigdata(const char *filename) {
+    explicit Sigdata(const char *filename) {
        parseFile(filename, patterns, flags, ids, originals);

    }
@ -454,9 +458,8 @@ public:
        // dynamic storage.)
        vector<const char *> cstrPatterns;
        cstrPatterns.reserve(patterns.size());
-        for (const auto &pattern : patterns) {
-            cstrPatterns.push_back(pattern.c_str());
-        }
+        auto pstr = [](const string &pattern) { return pattern.c_str(); };
+        std::transform(patterns.begin(), patterns.end(), std::back_inserter(cstrPatterns), pstr);

        Clock clock;
        clock.start();
@ -505,29 +508,29 @@ public:

 static
 void usage(const char *) {
-    cerr << "Usage:" << endl << endl;
-    cerr << "  patbench [-n repeats] [ -G generations] [ -C criterion ]" << endl
+    cerr << "Usage:\n\n";
+    cerr << "  patbench [-n repeats] [ -G generations] [ -C criterion ]\n"
         << "           [ -F factor_group_size ] [ -N | -S ] "
-         << "<pattern file> <pcap file>" << endl << endl
+         << "<pattern file> <pcap file>\n\n"
         << "    -n repeats sets the number of times the PCAP is repeatedly "
-            "scanned" << endl << "       with the pattern." << endl
+            "scanned\n" << "       with the pattern.\n"
         << "    -G generations sets the number of generations that the "
-            "algorithm is" << endl << "       run for." << endl
+            "algorithm is\n" << "       run for.\n"
         << "    -N sets non-streaming mode, -S sets streaming mode (default)."
         << endl << "    -F sets the factor group size (must be >0); this "
-                    "allows the detection" << endl
-         << "       of multiple interacting factors." << endl << "" << endl
-         << "    -C sets the 'criterion', which can be either:" << endl
+                    "allows the detection\n"
+         << "       of multiple interacting factors.\n" << "\n"
+         << "    -C sets the 'criterion', which can be either:\n"
         << "         t  throughput (the default) - this requires a pcap file"
-         << endl << "         r  scratch size" << endl
-         << "         s  stream state size" << endl
-         << "         c  compile time" << endl << "         b  bytecode size"
+         << endl << "         r  scratch size\n"
+         << "         s  stream state size\n"
+         << "         c  compile time\n" << "         b  bytecode size"
         << endl << endl
         << "We recommend the use of a utility like 'taskset' on "
-            "multiprocessor hosts to" << endl
+            "multiprocessor hosts to\n"
         << "lock execution to a single processor: this will remove processor "
-            "migration" << endl
-         << "by the scheduler as a source of noise in the results." << endl;
+            "migration\n"
+         << "by the scheduler as a source of noise in the results.\n";
 }

 static
@ -559,7 +562,7 @@ double measure_block_time(Benchmark &bench, unsigned int repeatCount) {
 }

 static
-double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
+double eval_set(Benchmark &bench, const Sigdata &sigs, unsigned int mode,
                unsigned repeatCount, Criterion criterion,
                bool diagnose = true) {
    double compileTime = 0;
@ -570,7 +573,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
        size_t dbSize;
        hs_error_t err = hs_database_size(bench.getDatabase(), &dbSize);
        if (err != HS_SUCCESS) {
-            cerr << "ERROR: could not retrieve bytecode size" << endl;
+            cerr << "ERROR: could not retrieve bytecode size\n";
            exit(1);
        }
        return dbSize;
@ -581,7 +584,7 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
        size_t streamStateSize;
        hs_error_t err = hs_stream_size(bench.getDatabase(), &streamStateSize);
        if (err != HS_SUCCESS) {
-            cerr << "ERROR: could not retrieve stream state size" << endl;
+            cerr << "ERROR: could not retrieve stream state size\n";
            exit(1);
        }
        return streamStateSize;
@ -599,8 +602,9 @@ double eval_set(Benchmark &bench, Sigdata &sigs, unsigned int mode,
        scan_time = measure_stream_time(bench, repeatCount);
    }
    size_t bytes = bench.bytes();
-    size_t matches = bench.matches();
+    
    if (diagnose) {
+        size_t matches = bench.matches();
        std::ios::fmtflags f(cout.flags());
        cout << "Scan time " << std::fixed << std::setprecision(3) << scan_time
             << " sec, Scanned " << bytes * repeatCount << " bytes, Throughput "
@ -679,14 +683,13 @@ int main(int argc, char **argv) {
    Benchmark bench;
    if (criterion == CRITERION_THROUGHPUT) {
        if (!bench.readStreams(pcapFile)) {
-            cerr << "Unable to read packets from PCAP file. Exiting." << endl;
+            cerr << "Unable to read packets from PCAP file. Exiting.\n";
            exit(-1);
        }
    }

    if ((criterion == CRITERION_STREAM_STATE) && (mode != HS_MODE_STREAM)) {
-        cerr << "Cannot evaluate stream state for block mode compile. Exiting."
-             << endl;
+        cerr << "Cannot evaluate stream state for block mode compile. Exiting.\n";
        exit(-1);
    }

@ -724,7 +727,7 @@ int main(int argc, char **argv) {
    unsigned generations = min(gen_max, (sigs.size() - 1) / factor_max);

    cout << "Cutting signatures cumulatively for " << generations
-         << " generations" << endl;
+         << " generations\n";
    for (unsigned gen = 0; gen < generations; ++gen) {
        cout << "Generation " << gen << " ";
        set<unsigned> s(work_sigs.begin(), work_sigs.end());
@ -768,7 +771,7 @@ int main(int argc, char **argv) {
        cout << "Performance: ";
        print_criterion(criterion, best);
        cout << " (" << std::fixed << std::setprecision(3) << (best / score_base)
-             << "x) after cutting:" << endl;
+             << "x) after cutting:\n";
        cout.flags(out_f);

        // s now has factor_max signatures
@ -791,7 +794,7 @@ int main(int argc, char **argv) {
 static
 bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
                   unsigned int *length) {
-    const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
+    const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
    const tcphdr *th = nullptr;

    // Ignore packets that aren't IPv4
@ -810,7 +813,7 @@ bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,

    switch (iph->ip_p) {
    case IPPROTO_TCP:
-        th = (const tcphdr *)((const char *)iph + ihlen);
+        th = reinterpret_cast<const tcphdr *>(reinterpret_cast<const char *>(iph) + ihlen);
        thlen = th->th_off * 4;
        break;
    case IPPROTO_UDP:
@ -847,7 +850,7 @@ static unsigned parseFlags(const string &flagsStr) {
        case '\r': // stray carriage-return
            break;
        default:
-            cerr << "Unsupported flag \'" << c << "\'" << endl;
+            cerr << "Unsupported flag \'" << c << "\'\n";
            exit(-1);
        }
    }
@ -859,7 +862,7 @@ static void parseFile(const char *filename, vector<string> &patterns,
                      vector<string> &originals) {
    ifstream inFile(filename);
    if (!inFile.good()) {
-        cerr << "ERROR: Can't open pattern file \"" << filename << "\"" << endl;
+        cerr << "ERROR: Can't open pattern file \"" << filename << "\"\n";
        exit(-1);
    }

@ -889,7 +892,7 @@ static void parseFile(const char *filename, vector<string> &patterns,

        size_t flagsStart = expr.find_last_of('/');
        if (flagsStart == string::npos) {
-            cerr << "ERROR: no trailing '/' char" << endl;
+            cerr << "ERROR: no trailing '/' char\n";
            exit(-1);
        }

--- a/examples/pcapscan.cc
+++ b/examples/pcapscan.cc
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -54,6 +55,7 @@
 #include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
@ -68,7 +70,12 @@
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/ip_icmp.h>
+#ifdef __NetBSD__
+#include <net/ethertypes.h>
+#include <net/if_ether.h>
+#else
 #include <net/ethernet.h>
+#endif /* __NetBSD__ */
 #include <arpa/inet.h>

 #include <pcap.h>
@ -93,15 +100,15 @@ struct FiveTuple {
    unsigned int dstPort;

    // Construct a FiveTuple from a TCP or UDP packet.
-    FiveTuple(const struct ip *iphdr) {
+    explicit FiveTuple(const struct ip *iphdr) {
        // IP fields
        protocol = iphdr->ip_p;
        srcAddr = iphdr->ip_src.s_addr;
        dstAddr = iphdr->ip_dst.s_addr;

        // UDP/TCP ports
-        const struct udphdr *uh =
-            (const struct udphdr *)(((const char *)iphdr) + (iphdr->ip_hl * 4));
+        const char *iphdr_base = reinterpret_cast<const char *>(iphdr);
+        const struct udphdr *uh = reinterpret_cast<const struct udphdr *>(iphdr_base + (iphdr->ip_hl * 4));
        srcPort = uh->uh_sport;
        dstPort = uh->uh_dport;
    }
@ -130,7 +137,7 @@ static
 int onMatch(unsigned int id, unsigned long long from, unsigned long long to,
            unsigned int flags, void *ctx) {
    // Our context points to a size_t storing the match count
-    size_t *matches = (size_t *)ctx;
+    size_t *matches = static_cast<size_t *>(ctx);
    (*matches)++;
    return 0; // continue matching
 }
@ -226,9 +233,8 @@ public:
            }

            // Valid TCP or UDP packet
-            const struct ip *iphdr = (const struct ip *)(pktData
-                    + sizeof(struct ether_header));
-            const char *payload = (const char *)pktData + offset;
+            const struct ip *iphdr = reinterpret_cast<const struct ip *>(pktData + sizeof(struct ether_header));
+            const char *payload = reinterpret_cast<const char *>(pktData) + offset;

            size_t id = stream_map.insert(std::make_pair(FiveTuple(iphdr),
                                          stream_map.size())).first->second;
@ -244,9 +250,8 @@ public:
    // Return the number of bytes scanned
    size_t bytes() const {
        size_t sum = 0;
-        for (const auto &packet : packets) {
-            sum += packet.size();
-        }
+        auto packs = [](size_t z, const string &packet) { return z + packet.size(); };
+        sum += std::accumulate(packets.begin(), packets.end(), size_t{0}, packs); // int seed would truncate
        return sum;
    }

@ -275,7 +280,7 @@ public:
    // Close all open Hyperscan streams (potentially generating any
    // end-anchored matches)
    void closeStreams() {
-        for (auto &stream : streams) {
+        for (const auto &stream : streams) {
            hs_error_t err = hs_close_stream(stream, scratch, onMatch,
                                             &matchCount);
            if (err != HS_SUCCESS) {
@ -427,7 +432,8 @@ static void databasesFromFile(const char *filename,
    // storage.)
    vector<const char*> cstrPatterns;
    for (const auto &pattern : patterns) {
-        cstrPatterns.push_back(pattern.c_str());
+        // cppcheck-suppress useStlAlgorithm
+        cstrPatterns.push_back(pattern.c_str()); //NOLINT (performance-inefficient-vector-operation)
    }

    cout << "Compiling Hyperscan databases with " << patterns.size()
@ -568,7 +574,8 @@ int main(int argc, char **argv) {
 */
 static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,
                          unsigned int *length) {
-    const ip *iph = (const ip *)(pkt_data + sizeof(ether_header));
+    const ip *iph = reinterpret_cast<const ip *>(pkt_data + sizeof(ether_header));
+    const char *iph_base = reinterpret_cast<const char *>(iph);
    const tcphdr *th = nullptr;

    // Ignore packets that aren't IPv4
@ -587,7 +594,7 @@ static bool payloadOffset(const unsigned char *pkt_data, unsigned int *offset,

    switch (iph->ip_p) {
    case IPPROTO_TCP:
-        th = (const tcphdr *)((const char *)iph + ihlen);
+        th = reinterpret_cast<const tcphdr *>(iph_base + ihlen);
        thlen = th->th_off * 4;
        break;
    case IPPROTO_UDP:
--- a/examples/simplegrep.c
+++ b/examples/simplegrep.c
@ -67,7 +67,7 @@
 * to pass in the pattern that was being searched for so we can print it out.
 */
 static int eventHandler(unsigned int id, unsigned long long from,
-                        unsigned long long to, unsigned int flags, void *ctx) {
+                        unsigned long long to, unsigned int flags, void *ctx) { // cppcheck-suppress constParameterCallback
    printf("Match for pattern \"%s\" at offset %llu\n", (char *)ctx, to);
    return 0;
 }
@ -150,7 +150,7 @@ int main(int argc, char *argv[]) {
    }

    char *pattern = argv[1];
-    char *inputFN = argv[2];
+    const char *inputFN = argv[2];

    /* First, we attempt to compile the pattern provided on the command line.
     * We assume 'DOTALL' semantics, meaning that the '.' meta-character will
--- a/libhs.pc.in
+++ b/libhs.pc.in
@ -4,7 +4,7 @@ libdir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@
 includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@

 Name: libhs
-Description: Intel(R) Hyperscan Library
+Description: A portable fork of the high-performance regular expression matching library
 Version: @HS_VERSION@
 Libs: -L${libdir} -lhs
 Cflags: -I${includedir}/hs
--- a/scripts/change_command.py
+++ b/scripts/change_command.py
@ -0,0 +1,53 @@
+#
+#  Copyright (c) 2020-2023, VectorCamp PC
+#  
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#  
+#   * Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of Intel Corporation nor the names of its contributors
+#     may be used to endorse or promote products derived from this software
+#     without specific prior written permission.
+#  
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+#
+import json
+import sys
+
+# Reads the first comment line of the clang-tidy config file, which holds a
+# comma-separated list of files to drop from compile_commands.json. Usage:
+# python3 ../source/scripts/change_command.py ../source/.clang-tidy ./compile_commands.json
+clang_tidy_config_path = sys.argv[1]
+compile_commands_path = sys.argv[2]
+
+# Load the data from the file
+with open(compile_commands_path, 'r') as f:
+    data = json.load(f)
+
+# Read the first comment line; a config file without one ignores nothing
+with open(clang_tidy_config_path, 'r') as f:
+    first_comment = next((line for line in f if line.startswith('#')), '#')
+
+# Drop empty names: '' is a substring of every path and would remove all entries
+ignore_files = [name.strip() for name in first_comment[1:].split(',') if name.strip()]
+
+# Filter out the entries for the ignored files
+data = [entry for entry in data if not any(ignore_file in entry['file'] for ignore_file in ignore_files)]
+
+# Write the result to the same file
+with open(compile_commands_path, 'w') as f:
+    json.dump(data, f, indent=2)
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 416091ebdb9e901b29d026633e73167d6353a0b0
--- a/src/compiler/asserts.cpp
+++ b/src/compiler/asserts.cpp
@ -176,7 +176,8 @@ void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr,
            auto ecit = edge_cache.find(cache_key);
            if (ecit == edge_cache.end()) {
                DEBUG_PRINTF("adding edge %zu %zu\n", g[u].index, g[v].index);
-                NFAEdge e = add_edge(u, v, g);
+                NFAEdge e;
+                std::tie(e, std::ignore) = add_edge(u, v, g);
                edge_cache.emplace(cache_key, e);
                g[e].assert_flags = flags;
                if (++assert_edge_count > MAX_ASSERT_EDGES) {
@ -229,11 +230,12 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,

        /* we need to interpose a dummy dot vertex between v and accept if
         * required so that ^ doesn't match trailing \n */
-         for (const auto &e : out_edges_range(v, g)) {
-            if (target(e, g) == g.accept) {
-                dead.emplace_back(e);
-            }
-        }
+        auto deads = [&g=g](const NFAEdge &e) {
+            return (target(e, g) == g.accept);
+        };
+        const auto &er = out_edges_range(v, g);
+        std::copy_if(begin(er), end(er),  std::back_inserter(dead), deads);
+
        /* assert has been resolved; clear flag */
        g[v].assert_flags &= ~POS_FLAG_MULTILINE_START;
    }
@ -251,6 +253,7 @@ void checkForMultilineStart(ReportManager &rm, NGHolder &g,

 static
 bool hasAssertVertices(const NGHolder &g) {
+    // cppcheck-suppress useStlAlgorithm
    for (auto v : vertices_range(g)) {
        int flags = g[v].assert_flags;
        if (flags & WORDBOUNDARY_FLAGS) {
--- a/src/compiler/compiler.cpp
+++ b/src/compiler/compiler.cpp
@ -417,7 +417,7 @@ void addLitExpression(NG &ng, unsigned index, const char *expression,
                           "HS_FLAG_SOM_LEFTMOST are supported in literal API.");
    }

-    if (!strcmp(expression, "")) {
+    if (expLength == 0) {
        throw CompileError("Pure literal API doesn't support empty string.");
    }

@ -443,7 +443,7 @@ bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) {
    if (!rose) {
        DEBUG_PRINTF("error building rose\n");
        assert(0);
-        return nullptr;
+        return bytecode_ptr<RoseEngine>(nullptr);
    }

    dumpReportManager(ng.rm, ng.cc.grey);
@ -478,7 +478,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
    DEBUG_PRINTF("db size %zu\n", db_len);
    DEBUG_PRINTF("db platform %llx\n", platform);

-    struct hs_database *db = (struct hs_database *)hs_database_alloc(db_len);
+    struct hs_database *db = static_cast<struct hs_database *>(hs_database_alloc(db_len));
    if (hs_check_alloc(db) != HS_SUCCESS) {
        hs_database_free(db);
        return nullptr;
@ -492,7 +492,7 @@ hs_database_t *dbCreate(const char *in_bytecode, size_t len, u64a platform) {
    DEBUG_PRINTF("shift is %zu\n", shift);

    db->bytecode = offsetof(struct hs_database, bytes) - shift;
-    char *bytecode = (char *)db + db->bytecode;
+    char *bytecode = reinterpret_cast<char *>(db) + db->bytecode;
    assert(ISALIGNED_CL(bytecode));

    db->magic = HS_DB_MAGIC;
@ -525,7 +525,7 @@ struct hs_database *build(NG &ng, unsigned int *length, u8 pureFlag) {
        throw CompileError("Internal error.");
    }

-    const char *bytecode = (const char *)(rose.get());
+    const char *bytecode = reinterpret_cast<const char *>(rose.get());
    const platform_t p = target_to_platform(ng.cc.target_info);
    struct hs_database *db = dbCreate(bytecode, *length, p);
    if (!db) {
--- a/src/compiler/error.cpp
+++ b/src/compiler/error.cpp
@ -57,15 +57,14 @@ extern const hs_compile_error_t hs_badalloc = {
 namespace ue2 {

 hs_compile_error_t *generateCompileError(const string &err, int expression) {
-    hs_compile_error_t *ret =
-        (struct hs_compile_error *)hs_misc_alloc(sizeof(hs_compile_error_t));
+    hs_compile_error_t *ret = static_cast<struct hs_compile_error *>(hs_misc_alloc(sizeof(hs_compile_error_t)));
    if (ret) {
        hs_error_t e = hs_check_alloc(ret);
        if (e != HS_SUCCESS) {
            hs_misc_free(ret);
            return const_cast<hs_compile_error_t *>(&hs_badalloc);
        }
-        char *msg = (char *)hs_misc_alloc(err.size() + 1);
+        char *msg = static_cast<char *>(hs_misc_alloc(err.size() + 1));
        if (msg) {
            e = hs_check_alloc(msg);
            if (e != HS_SUCCESS) {
--- a/src/crc32.c
+++ b/src/crc32.c
@ -542,14 +542,13 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf,

    // Main aligned loop, processes eight bytes at a time.

-    u32 term1, term2;
    for (size_t li = 0; li < running_length/8; li++) {
        u32 block = *(const u32 *)p_buf;
        crc ^= block;
        p_buf += 4;
-        term1 = crc_tableil8_o88[crc & 0x000000FF] ^
+        u32 term1 = crc_tableil8_o88[crc & 0x000000FF] ^
                crc_tableil8_o80[(crc >> 8) & 0x000000FF];
-        term2 = crc >> 16;
+        u32 term2 = crc >> 16;
        crc = term1 ^
              crc_tableil8_o72[term2 & 0x000000FF] ^
              crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
--- a/src/database.h
+++ b/src/database.h
@ -79,21 +79,18 @@ static UNUSED
 const platform_t hs_current_platform_no_avx2 = {
    HS_PLATFORM_NOAVX2 |
    HS_PLATFORM_NOAVX512 |
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI 
 };

 static UNUSED
 const platform_t hs_current_platform_no_avx512 = {
    HS_PLATFORM_NOAVX512 |
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI
 };

 static UNUSED
 const platform_t hs_current_platform_no_avx512vbmi = {
-    HS_PLATFORM_NOAVX512VBMI |
-    0,
+    HS_PLATFORM_NOAVX512VBMI 
 };

 /*
@ -115,6 +112,7 @@ struct hs_database {

 static really_inline
 const void *hs_get_bytecode(const struct hs_database *db) {
+    // cppcheck-suppress cstyleCast
    return ((const char *)db + db->bytecode);
 }

--- a/src/dispatcher.c
+++ b/src/dispatcher.c
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2016-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -30,6 +31,39 @@
 #include "hs_common.h"
 #include "hs_runtime.h"
 #include "ue2common.h"
+
+/* Streamlining the dispatch to eliminate runtime checking/branching:
+ * The first call to a dispatched function runs the resolve code, which
+ * sets the static resolved/dispatch pointer to point to the correct
+ * function. Subsequent calls to the function go directly through the
+ * resolved pointer. The simplest way to accomplish this is to
+ * initially set the pointer to the resolve function.
+ * To accomplish this in a manner invisible to the user,
+ * we do involve some rather ugly/confusing macros in here.
+ * There are four macros that assemble the code for each function
+ * we want to dispatch in this manner:
+ * CREATE_DISPATCH
+ * this generates the declarations for the candidate target functions,
+ * for the fat_dispatch function pointer, for the resolve_ function,
+ * points the function pointer to the resolve function, and contains
+ * most of the definition of the resolve function. The very end of the
+ * resolve function is completed by the next macro, because in the
+ * CREATE_DISPATCH macro we have the argument list with the arg declarations,
+ * which is needed to generate correct function signatures, but we
+ * can't generate from this, in a macro, a _call_ to one of those functions.
+ * CONNECT_ARGS_1
+ * this macro fills in the actual call at the end of the resolve function,
+ * with the correct arg list; hence the name "connect args".
+ * CONNECT_DISPATCH_2
+ * this macro likewise provides the beginning of the definition of the
+ * actual entry point function (the 'real name' that's called by the user),
+ * but again it cannot make the pass-through call to the target without
+ * the arg list, which is supplied by the final macro,
+ * CONNECT_ARGS_3
+ *
+ */
+
+
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/cpuid_inline.h"
 #include "util/join.h"
@ -57,30 +91,38 @@
        return (RTYPE)HS_ARCH_ERROR;                                           \
    }                                                                          \
                                                                               \
-    /* resolver */                                                             \
-    static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) {                  \
-        if (check_avx512vbmi()) {                                              \
-            return JOIN(avx512vbmi_, NAME);                                    \
-        }                                                                      \
-        if (check_avx512()) {                                                  \
-            return JOIN(avx512_, NAME);                                        \
-        }                                                                      \
-        if (check_avx2()) {                                                    \
-            return JOIN(avx2_, NAME);                                          \
-        }                                                                      \
-        if (check_sse42() && check_popcnt()) {                                 \
-            return JOIN(corei7_, NAME);                                        \
-        }                                                                      \
-        if (check_ssse3()) {                                                   \
-            return JOIN(core2_, NAME);                                         \
-        }                                                                      \
-        /* anything else is fail */                                            \
-        return JOIN(error_, NAME);                                             \
-    }                                                                          \
+    /* dispatch routing pointer for this function */                           \
+    /* initially point it at the resolve function */                           \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__);                            \
+    static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) =                  \
+        &JOIN(resolve_, NAME);                                                 \
                                                                               \
-    /* function */                                                             \
-    HS_PUBLIC_API                                                              \
-    RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
+    /* resolver */                                                             \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) {                           \
+        if (check_avx512vbmi()) {                                              \
+            fat_dispatch_ ## NAME = &JOIN(avx512vbmi_, NAME);                  \
+        }                                                                      \
+        else if (check_avx512()) {                                             \
+            fat_dispatch_ ## NAME = &JOIN(avx512_, NAME);                      \
+        }                                                                      \
+        else if (check_avx2()) {                                               \
+            fat_dispatch_ ## NAME = &JOIN(avx2_, NAME);                        \
+        }                                                                      \
+        else if (check_sse42() && check_popcnt()) {                            \
+            fat_dispatch_ ## NAME = &JOIN(corei7_, NAME);                      \
+        }                                                                      \
+        else if (check_ssse3()) {                                              \
+            fat_dispatch_ ## NAME = &JOIN(core2_, NAME);                       \
+        } else {                                                               \
+            /* anything else is fail */                                        \
+            fat_dispatch_ ## NAME = &JOIN(error_, NAME);                       \
+        }                                                                      \
+
+
+
+/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
+
+

 #elif defined(ARCH_AARCH64)
 #include "util/arch/arm/cpuid_inline.h"
@ -97,99 +139,226 @@
        return (RTYPE)HS_ARCH_ERROR;                                           \
    }                                                                          \
                                                                               \
-    /* resolver */                                                             \
-    static RTYPE (*JOIN(resolve_, NAME)(void))(__VA_ARGS__) {                  \
-        if (check_sve2()) {                                                    \
-            return JOIN(sve2_, NAME);                                          \
-        }                                                                      \
-        if (check_sve()) {                                                     \
-            return JOIN(sve_, NAME);                                           \
-        }                                                                      \
-        if (check_neon()) {                                                    \
-            return JOIN(neon_, NAME);                                          \
-        }                                                                      \
-        /* anything else is fail */                                            \
-        return JOIN(error_, NAME);                                             \
-    }                                                                          \
+    /* dispatch routing pointer for this function */                           \
+    /* initially point it at the resolve function */                           \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__);                            \
+    static RTYPE (* JOIN(fat_dispatch_, NAME))(__VA_ARGS__) =                  \
+        &JOIN(resolve_, NAME);                                                 \
                                                                               \
-    /* function */                                                             \
-    HS_PUBLIC_API                                                              \
-    RTYPE NAME(__VA_ARGS__) __attribute__((ifunc("resolve_" #NAME)))
+    /* resolver */                                                             \
+    static RTYPE JOIN(resolve_, NAME)(__VA_ARGS__) {                           \
+        if (check_sve2()) {                                                    \
+            fat_dispatch_ ## NAME = &JOIN(sve2_, NAME);                        \
+        }                                                                      \
+        else if (check_sve()) {                                                \
+            fat_dispatch_ ## NAME = &JOIN(sve_, NAME);                         \
+        }                                                                      \
+        else if (check_neon()) {                                               \
+            fat_dispatch_ ## NAME = &JOIN(neon_, NAME);                        \
+        } else {                                                               \
+            /* anything else is fail */                                        \
+            fat_dispatch_ ## NAME = &JOIN(error_, NAME);                       \
+        }                                                                      \
+
+
+/* the rest of the function is completed in the CONNECT_ARGS_1 macro. */
+

 #endif

+
+#define CONNECT_ARGS_1(RTYPE, NAME, ...)                                       \
+        return (*fat_dispatch_ ## NAME)(__VA_ARGS__);                          \
+    }                                                                          \
+
+
+#define CONNECT_DISPATCH_2(RTYPE, NAME, ...)                                   \
+    /* new function */                                                         \
+    HS_PUBLIC_API                                                              \
+    RTYPE NAME(__VA_ARGS__) {                                                  \
+
+
+#define CONNECT_ARGS_3(RTYPE, NAME, ...)                                       \
+        return (*fat_dispatch_ ## NAME)(__VA_ARGS__);                          \
+    }                                                                          \
+
+
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-function"
+
+/* this gets a bit ugly to compose the static redirect functions,
+ * as we necessarily need first the typed arg list and then just the arg
+ * names, twice in a row, to define the redirect function and the
+ * dispatch function call */
+
 CREATE_DISPATCH(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
                unsigned length, unsigned flags, hs_scratch_t *scratch,
                match_event_handler onEvent, void *userCtx);
+CONNECT_ARGS_1(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan, const hs_database_t *db, const char *data,
+                unsigned length, unsigned flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *userCtx);
+CONNECT_ARGS_3(hs_error_t, hs_scan, db, data, length, flags, scratch, onEvent, userCtx);

 CREATE_DISPATCH(hs_error_t, hs_stream_size, const hs_database_t *database,
                size_t *stream_size);
+CONNECT_ARGS_1(hs_error_t, hs_stream_size, database, stream_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_stream_size, const hs_database_t *database,
+                size_t *stream_size);
+CONNECT_ARGS_3(hs_error_t, hs_stream_size, database, stream_size);

 CREATE_DISPATCH(hs_error_t, hs_database_size, const hs_database_t *db,
                size_t *size);
+CONNECT_ARGS_1(hs_error_t, hs_database_size, db, size);
+CONNECT_DISPATCH_2(hs_error_t, hs_database_size, const hs_database_t *db,
+                size_t *size);
+CONNECT_ARGS_3(hs_error_t, hs_database_size, db, size);
+
 CREATE_DISPATCH(hs_error_t, dbIsValid, const hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, dbIsValid, db);
+CONNECT_DISPATCH_2(hs_error_t, dbIsValid, const hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, dbIsValid, db);
+
 CREATE_DISPATCH(hs_error_t, hs_free_database, hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, hs_free_database, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_free_database, hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, hs_free_database, db);

 CREATE_DISPATCH(hs_error_t, hs_open_stream, const hs_database_t *db,
                unsigned int flags, hs_stream_t **stream);
+CONNECT_ARGS_1(hs_error_t, hs_open_stream, db, flags, stream);
+CONNECT_DISPATCH_2(hs_error_t, hs_open_stream, const hs_database_t *db,
+                unsigned int flags, hs_stream_t **stream);
+CONNECT_ARGS_3(hs_error_t, hs_open_stream, db, flags, stream);

 CREATE_DISPATCH(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
                unsigned int length, unsigned int flags, hs_scratch_t *scratch,
                match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_1(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan_stream, hs_stream_t *id, const char *data,
+                unsigned int length, unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_3(hs_error_t, hs_scan_stream, id, data, length, flags, scratch, onEvent, ctxt);

 CREATE_DISPATCH(hs_error_t, hs_close_stream, hs_stream_t *id,
                hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_1(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);
+CONNECT_DISPATCH_2(hs_error_t, hs_close_stream, hs_stream_t *id,
+                hs_scratch_t *scratch, match_event_handler onEvent, void *ctxt);
+CONNECT_ARGS_3(hs_error_t, hs_close_stream, id, scratch, onEvent, ctxt);

 CREATE_DISPATCH(hs_error_t, hs_scan_vector, const hs_database_t *db,
                const char *const *data, const unsigned int *length,
                unsigned int count, unsigned int flags, hs_scratch_t *scratch,
                match_event_handler onevent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_scan_vector, const hs_database_t *db,
+                const char *const *data, const unsigned int *length,
+                unsigned int count, unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onevent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_scan_vector, db, data, length, count, flags, scratch, onevent, context);

 CREATE_DISPATCH(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
+CONNECT_ARGS_1(hs_error_t, hs_database_info, db, info);
+CONNECT_DISPATCH_2(hs_error_t, hs_database_info, const hs_database_t *db, char **info);
+CONNECT_ARGS_3(hs_error_t, hs_database_info, db, info);

 CREATE_DISPATCH(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
                const hs_stream_t *from_id);
+CONNECT_ARGS_1(hs_error_t, hs_copy_stream, to_id, from_id);
+CONNECT_DISPATCH_2(hs_error_t, hs_copy_stream, hs_stream_t **to_id,
+                const hs_stream_t *from_id);
+CONNECT_ARGS_3(hs_error_t, hs_copy_stream, to_id, from_id);

 CREATE_DISPATCH(hs_error_t, hs_reset_stream, hs_stream_t *id,
                unsigned int flags, hs_scratch_t *scratch,
                match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_stream, hs_stream_t *id,
+                unsigned int flags, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_stream, id, flags, scratch, onEvent, context);

 CREATE_DISPATCH(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
                const hs_stream_t *from_id, hs_scratch_t *scratch,
                match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_copy_stream, hs_stream_t *to_id,
+                const hs_stream_t *from_id, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_and_copy_stream, to_id, from_id, scratch, onEvent, context);

 CREATE_DISPATCH(hs_error_t, hs_serialize_database, const hs_database_t *db,
                char **bytes, size_t *length);
+CONNECT_ARGS_1(hs_error_t, hs_serialize_database, db, bytes, length);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialize_database, const hs_database_t *db,
+                char **bytes, size_t *length);
+CONNECT_ARGS_3(hs_error_t, hs_serialize_database, db, bytes, length);

 CREATE_DISPATCH(hs_error_t, hs_deserialize_database, const char *bytes,
                const size_t length, hs_database_t **db);
+CONNECT_ARGS_1(hs_error_t, hs_deserialize_database, bytes, length, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database, const char *bytes,
+                const size_t length, hs_database_t **db);
+CONNECT_ARGS_3(hs_error_t, hs_deserialize_database, bytes, length, db);

 CREATE_DISPATCH(hs_error_t, hs_deserialize_database_at, const char *bytes,
                const size_t length, hs_database_t *db);
+CONNECT_ARGS_1(hs_error_t, hs_deserialize_database_at, bytes, length, db);
+CONNECT_DISPATCH_2(hs_error_t, hs_deserialize_database_at, const char *bytes,
+                const size_t length, hs_database_t *db);
+CONNECT_ARGS_3(hs_error_t, hs_deserialize_database_at, bytes, length, db);

 CREATE_DISPATCH(hs_error_t, hs_serialized_database_info, const char *bytes,
                size_t length, char **info);
+CONNECT_ARGS_1(hs_error_t, hs_serialized_database_info, bytes, length, info);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_info, const char *bytes,
+                size_t length, char **info);
+CONNECT_ARGS_3(hs_error_t, hs_serialized_database_info, bytes, length, info);

 CREATE_DISPATCH(hs_error_t, hs_serialized_database_size, const char *bytes,
                const size_t length, size_t *deserialized_size);
+CONNECT_ARGS_1(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_serialized_database_size, const char *bytes,
+                const size_t length, size_t *deserialized_size);
+CONNECT_ARGS_3(hs_error_t, hs_serialized_database_size, bytes, length, deserialized_size);

 CREATE_DISPATCH(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
                char *buf, size_t buf_space, size_t *used_space);
+CONNECT_ARGS_1(hs_error_t, hs_compress_stream, stream,
+                buf, buf_space, used_space);
+CONNECT_DISPATCH_2(hs_error_t, hs_compress_stream, const hs_stream_t *stream,
+                char *buf, size_t buf_space, size_t *used_space);
+CONNECT_ARGS_3(hs_error_t, hs_compress_stream, stream,
+                buf, buf_space, used_space);

 CREATE_DISPATCH(hs_error_t, hs_expand_stream, const hs_database_t *db,
                hs_stream_t **stream, const char *buf,size_t buf_size);
+CONNECT_ARGS_1(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);
+CONNECT_DISPATCH_2(hs_error_t, hs_expand_stream, const hs_database_t *db,
+                hs_stream_t **stream, const char *buf,size_t buf_size);
+CONNECT_ARGS_3(hs_error_t, hs_expand_stream, db, stream, buf,buf_size);

 CREATE_DISPATCH(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
                const char *buf, size_t buf_size, hs_scratch_t *scratch,
                match_event_handler onEvent, void *context);
+CONNECT_ARGS_1(hs_error_t, hs_reset_and_expand_stream, to_stream,
+                buf, buf_size, scratch, onEvent, context);
+CONNECT_DISPATCH_2(hs_error_t, hs_reset_and_expand_stream, hs_stream_t *to_stream,
+                const char *buf, size_t buf_size, hs_scratch_t *scratch,
+                match_event_handler onEvent, void *context);
+CONNECT_ARGS_3(hs_error_t, hs_reset_and_expand_stream, to_stream,
+                buf, buf_size, scratch, onEvent, context);

 /** INTERNALS **/

 CREATE_DISPATCH(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
+CONNECT_ARGS_1(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);
+CONNECT_DISPATCH_2(u32, Crc32c_ComputeBuf, u32 inCrc32, const void *buf, size_t bufLen);
+CONNECT_ARGS_3(u32, Crc32c_ComputeBuf, inCrc32, buf, bufLen);

 #pragma GCC diagnostic pop
 #pragma GCC diagnostic pop
+
--- a/src/fdr/fdr.c
+++ b/src/fdr/fdr.c
@ -298,7 +298,7 @@ void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
 static really_inline
 void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
                    const u32 *confBase, const struct FDR_Runtime_Args *a,
-                    const u8 *ptr, u32 *last_match_id, struct zone *z) {
+                    const u8 *ptr, u32 *last_match_id, const struct zone *z) {
    const u8 bucket = 8;

    if (likely(!*conf)) {
@ -308,7 +308,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
    /* ptr is currently referring to a location in the zone's buffer, we also
     * need a pointer in the original, main buffer for the final string compare.
     */
-    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust);
+    const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); // NOLINT(performance-no-int-to-ptr)

    const u8 *confLoc = ptr;

@ -333,7 +333,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control,
 }

 static really_inline
-void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) {
+void dumpZoneInfo(UNUSED const struct zone *z, UNUSED size_t zone_id) {
 #ifdef DEBUG
    DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf);
    DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n",
--- a/src/fdr/fdr_compile.cpp
+++ b/src/fdr/fdr_compile.cpp
@ -127,7 +127,7 @@ void andMask(u8 *dest, const u8 *a, const u8 *b, u32 num_bytes) {
 }

 void FDRCompiler::createInitialState(FDR *fdr) {
-    u8 *start = (u8 *)&fdr->start;
+    u8 *start = reinterpret_cast<u8 *>(&fdr->start);

    /* initial state should to be 1 in each slot in the bucket up to bucket
     * minlen - 1, and 0 thereafter */
@ -136,6 +136,7 @@ void FDRCompiler::createInitialState(FDR *fdr) {
        const vector<LiteralIndex> &bucket_lits = bucketToLits[b];
        u32 min_len = ~0U;
        for (const LiteralIndex &lit_idx : bucket_lits) {		
+            // cppcheck-suppress useStlAlgorithm		
            min_len = min(min_len, verify_u32(lits[lit_idx].s.length()));		
        }

@ -175,7 +176,7 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
    assert(fdr); // otherwise would have thrown std::bad_alloc

-    u8 *fdr_base = (u8 *)fdr.get();
+    u8 *fdr_base = reinterpret_cast<u8 *>(fdr.get());

    // Write header.
    fdr->size = size;
@ -205,7 +206,6 @@ bytecode_ptr<FDR> FDRCompiler::setupFDR() {
    assert(ISALIGNED_CL(ptr));
    fdr->floodOffset = verify_u32(ptr - fdr_base);
    memcpy(ptr, floodTable.get(), floodTable.size());
-    ptr += floodTable.size(); // last write, no need to round up
    
    return fdr;
 }
--- a/src/fdr/fdr_compile_util.cpp
+++ b/src/fdr/fdr_compile_util.cpp
@ -39,6 +39,7 @@ namespace ue2 {
 size_t maxLen(const vector<hwlmLiteral> &lits) {
    size_t rv = 0;
    for (const auto &lit : lits) {
+        // cppcheck-suppress useStlAlgorithm
        rv = max(rv, lit.s.size());
    }
    return rv;
--- a/src/fdr/fdr_confirm.h
+++ b/src/fdr/fdr_confirm.h
@ -84,9 +84,10 @@ struct FDRConfirm {

 static really_inline
 const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) {
+    // cppcheck-suppress cstyleCast
    const u8 *base = (const u8 *)fdrc;
-    const u32 *litIndex =
-        (const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
+    // cppcheck-suppress cstyleCast
+    const u32 *litIndex =(const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32)));
    assert(ISALIGNED(litIndex));
    return litIndex;
 }
--- a/src/fdr/fdr_confirm_compile.cpp
+++ b/src/fdr/fdr_confirm_compile.cpp
@ -58,7 +58,7 @@ u64a make_u64a_mask(const vector<u8> &v) {
    u64a mask = 0;
    size_t vlen = v.size();
    size_t len = std::min(vlen, sizeof(mask));
-    unsigned char *m = (unsigned char *)&mask;
+    u8 *m = reinterpret_cast<u8 *>(&mask);
    memcpy(m + sizeof(mask) - len, &v[vlen - len], len);
    return mask;
 }
@ -159,7 +159,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
    map<u32, vector<LiteralIndex> > res2lits;
    hwlm_group_t gm = 0;
    for (LiteralIndex i = 0; i < lits.size(); i++) {
-        LitInfo & li = tmpLitInfo[i];
+        const LitInfo & li = tmpLitInfo[i];
        u32 hash = CONF_HASH_CALL(li.v, andmsk, mult, nBits);
        DEBUG_PRINTF("%016llx --> %u\n", li.v, hash);
        res2lits[hash].emplace_back(i);
@ -245,10 +245,10 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
    fdrc->groups = gm;

    // After the FDRConfirm, we have the lit index array.
-    u8 *fdrc_base = (u8 *)fdrc.get();
+    u8 *fdrc_base = reinterpret_cast<u8 *>(fdrc.get());
    u8 *ptr = fdrc_base + sizeof(*fdrc);
    ptr = ROUNDUP_PTR(ptr, alignof(u32));
-    u32 *bitsToLitIndex = (u32 *)ptr;
+    u32 *bitsToLitIndex = reinterpret_cast<u32 *>(ptr);
    ptr += bitsToLitIndexSize;

    // After the lit index array, we have the LitInfo structures themselves,
@ -265,7 +265,7 @@ bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
            LiteralIndex litIdx = *i;

            // Write LitInfo header.
-            LitInfo &finalLI = *(LitInfo *)ptr;
+            LitInfo &finalLI = *(reinterpret_cast<LitInfo *>(ptr));
            finalLI = tmpLitInfo[litIdx];

            ptr += sizeof(LitInfo); // String starts directly after LitInfo.
@ -294,15 +294,13 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
               const EngineDescription &eng,
               const map<BucketIndex, vector<LiteralIndex>> &bucketToLits,
               bool make_small) {
-    unique_ptr<TeddyEngineDescription> teddyDescr =
-        getTeddyDescription(eng.getID());
-
    BC2CONF bc2Conf;
    u32 totalConfirmSize = 0;
    for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) {
        if (contains(bucketToLits, b)) {
            vector<hwlmLiteral> vl;
            for (const LiteralIndex &lit_idx : bucketToLits.at(b)) {
+                // cppcheck-suppress useStlAlgorithm
                vl.emplace_back(lits[lit_idx]);
            }

@ -320,7 +318,7 @@ setupFullConfs(const vector<hwlmLiteral> &lits,
    auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 64);
    assert(buf); // otherwise would have thrown std::bad_alloc

-    u32 *confBase = (u32 *)buf.get();
+    u32 *confBase = reinterpret_cast<u32 *>(buf.get());
    u8 *ptr = buf.get() + totalConfSwitchSize;
    assert(ISALIGNED_CL(ptr));

--- a/src/fdr/fdr_confirm_runtime.h
+++ b/src/fdr/fdr_confirm_runtime.h
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2015-2019, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -54,9 +55,14 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
    if (likely(!start)) {
        return;
    }
-
+// these cplusplus checks are needed because this is included in both fdr.c and teddy.cpp
+#ifdef __cplusplus
+    const struct LitInfo *li
+        = reinterpret_cast<const struct LitInfo *>(reinterpret_cast<const u8 *>(fdrc) + start);
+#else
    const struct LitInfo *li
        = (const struct LitInfo *)((const u8 *)fdrc + start);
+#endif

    struct hs_scratch *scratch = a->scratch;
    assert(!scratch->fdr_conf);
@ -74,18 +80,20 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a
            goto out;
        }

-        const u8 *loc = buf + i - li->size + 1;
+        do{  // confine 'loc' to its own scope so the 'goto out' above does not jump past its declaration
+            const u8 *loc = buf + i - li->size + 1;
        
-        if (loc < buf) {
-            u32 full_overhang = buf - loc;
-            size_t len_history = a->len_history;
+            if (loc < buf) {
+                u32 full_overhang = buf - loc;
+                size_t len_history = a->len_history;

-            // can't do a vectored confirm either if we don't have
-            // the bytes
-            if (full_overhang > len_history) {
-                goto out;
+                // can't do a vectored confirm either if we don't have
+                // the bytes
+                if (full_overhang > len_history) {
+                    goto out;
+                }
            }
-        }
+        }while(0);
        assert(li->size <= sizeof(CONF_TYPE));

        if (unlikely(!(li->groups & *control))) {
--- a/src/fdr/fdr_dump.cpp
+++ b/src/fdr/fdr_dump.cpp
@ -74,9 +74,9 @@ void dumpLitIndex(const FDRConfirm *fdrc, FILE *f) {
 static
 void dumpConfirms(const void *fdr_base, u32 conf_offset, u32 num_confirms,
                  FILE *f) {
-    const u32 *conf = (const u32 *)((const char *)fdr_base + conf_offset);
+    const u32 *conf = reinterpret_cast<const u32 *>(reinterpret_cast<const char *>(fdr_base) + conf_offset);
    for (u32 i = 0; i < num_confirms; i++) {
-        const auto *fdrc = (const FDRConfirm *)((const char *)conf + conf[i]);
+        const auto *fdrc = reinterpret_cast<const FDRConfirm *>(reinterpret_cast<const char *>(conf) + conf[i]);
        fprintf(f, "    confirm %u\n", i);
        fprintf(f, "      andmsk  0x%016llx\n", fdrc->andmsk);
        fprintf(f, "      mult    0x%016llx\n", fdrc->mult);
@ -113,7 +113,7 @@ void dumpTeddyDupMasks(const u8 *dmsk, u32 numMasks, FILE *f) {
    u32 maskWidth = 2;
    fprintf(f, "    dup nibble masks:\n");
    for (u32 i = 0; i < numMasks * 2; i++) {
-        fprintf(f, "      -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
+        fprintf(f, "      -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
        for (u32 j = 0; j < 16 * maskWidth * 2; j++) {
            u8 val = dmsk[i * 16 * maskWidth * 2 + j];
            for (u32 k = 0; k < 8; k++) {
@ -131,7 +131,7 @@ void dumpTeddyMasks(const u8 *baseMsk, u32 numMasks, u32 maskWidth, FILE *f) {
    // dump nibble masks
    fprintf(f, "    nibble masks:\n");
    for (u32 i = 0; i < numMasks * 2; i++) {
-        fprintf(f, "      -%d%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
+        fprintf(f, "      -%u%s: ", 1 + i / 2, (i % 2) ? "hi" : "lo");
        for (u32 j = 0; j < 16 * maskWidth; j++) {
            u8 val = baseMsk[i * 16 * maskWidth + j];
            for (u32 k = 0; k < 8; k++) {
@ -157,7 +157,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {
    fprintf(f, "    buckets    %u\n", des->getNumBuckets());
    fprintf(f, "    packed     %s\n", des->packed ? "true" : "false");
    fprintf(f, "    strings    %u\n", teddy->numStrings);
-    fprintf(f, "    size       %zu bytes\n", fdrSize((const FDR *)teddy));
+    fprintf(f, "    size       %zu bytes\n", fdrSize(reinterpret_cast<const FDR *>(teddy)));
    fprintf(f, "    max length %u\n", teddy->maxStringLen);
    fprintf(f, "    floodoff   %u (%x)\n", teddy->floodOffset,
            teddy->floodOffset);
@ -165,7 +165,7 @@ void dumpTeddy(const Teddy *teddy, FILE *f) {

    u32 maskWidth = des->getNumBuckets() / 8;
    size_t headerSize = sizeof(Teddy);
-    const u8 *teddy_base = (const u8 *)teddy;
+    const u8 *teddy_base = reinterpret_cast<const u8 *>(teddy);
    const u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
    dumpTeddyMasks(baseMsk, des->numMasks, maskWidth, f);
    size_t maskLen = des->numMasks * 16 * 2 * maskWidth;
@ -201,7 +201,7 @@ void dumpFDR(const FDR *fdr, FILE *f) {

 void fdrPrintStats(const FDR *fdr, FILE *f) {
    if (fdrIsTeddy(fdr)) {
-        dumpTeddy((const Teddy *)fdr, f);
+        dumpTeddy(reinterpret_cast<const Teddy *>(fdr), f);
    } else {
        dumpFDR(fdr, f);
    }
--- a/src/fdr/fdr_engine_description.cpp
+++ b/src/fdr/fdr_engine_description.cpp
@ -71,7 +71,7 @@ u32 findDesiredStride(size_t num_lits, size_t min_len, size_t min_len_count) {
        } else if (num_lits < 5000) {
            // for larger but not huge sizes, go to stride 2 only if we have at
            // least minlen 3
-            desiredStride = MIN(min_len - 1, 2);
+            desiredStride = std::min(min_len - 1, 2UL);
        }
    }

--- a/src/fdr/flood_compile.cpp
+++ b/src/fdr/flood_compile.cpp
@ -208,8 +208,8 @@ bytecode_ptr<u8> setupFDRFloodControl(const vector<hwlmLiteral> &lits,
    auto buf = make_zeroed_bytecode_ptr<u8>(totalSize, 16);
    assert(buf); // otherwise would have thrown std::bad_alloc

-    u32 *floodHeader = (u32 *)buf.get();
-    FDRFlood *layoutFlood = (FDRFlood *)(buf.get() + floodHeaderSize);
+    u32 *floodHeader = reinterpret_cast<u32 *>(buf.get());
+    FDRFlood *layoutFlood = reinterpret_cast<FDRFlood *>(buf.get() + floodHeaderSize);

    u32 currentFloodIndex = 0;
    for (const auto &m : flood2chars) {
--- a/src/fdr/flood_runtime.h
+++ b/src/fdr/flood_runtime.h
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -37,6 +38,13 @@
 #define FLOOD_MINIMUM_SIZE 256
 #define FLOOD_BACKOFF_START 32

+// This header is included from both C (fdr.c) and C++ (teddy.cpp) sources, so
+// the pointer cast has to be spelled per language. The C form is wrapped in
+// an outer pair of parentheses so that the expansion is a single primary
+// expression and composes safely with postfix operators such as [] or ->.
+#if defined __cplusplus
+#define CU64A_P_CAST(X) reinterpret_cast<const u64a*>(X)
+#else
+#define CU64A_P_CAST(X) ((const u64a *)(X))
+#endif
+
 static really_inline
 const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {
    // if we don't have a flood at either the start or end,
@ -47,18 +55,18 @@ const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) {

    /* entry points in runtime.c prefetch relevant data */
 #ifndef FLOOD_32
-    u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8);
-    u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8);
+    u64a x11 = *CU64A_P_CAST(ROUNDUP_PTR(buf, 8));
+    u64a x12 = *CU64A_P_CAST(ROUNDUP_PTR(buf+8, 8));
    if (x11 == x12) {
        return buf + floodBackoff;
    }
-    u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8);
-    u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8);
+    u64a x21 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2, 8));
+    u64a x22 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len/2 + 8, 8));
    if (x21 == x22) {
        return buf + floodBackoff;
    }
-    u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8);
-    u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8);
+    u64a x31 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 24, 8));
+    u64a x32 = *CU64A_P_CAST(ROUNDUP_PTR(buf + len - 16, 8));
    if (x31 == x32) {
        return buf + floodBackoff;
    }
@ -106,9 +114,15 @@ const u8 * floodDetect(const struct FDR * fdr,

    // go from c to our FDRFlood structure
    u8 c = buf[i];
+#ifdef __cplusplus
+    const u8 * fBase = (reinterpret_cast<const u8 *>(fdr)) + fdr->floodOffset;
+    u32 fIdx = (reinterpret_cast<const u32 *>(fBase))[c];
+    const struct FDRFlood * fsb = reinterpret_cast<const struct FDRFlood *>(fBase + sizeof(u32) * 256);
+#else
    const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset;
    u32 fIdx = ((const u32 *)fBase)[c];
    const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256);
+#endif
    const struct FDRFlood * fl = &fsb[fIdx];

 #ifndef FLOOD_32
@ -116,7 +130,7 @@ const u8 * floodDetect(const struct FDR * fdr,
    cmpVal |= cmpVal << 8;
    cmpVal |= cmpVal << 16;
    cmpVal |= cmpVal << 32;
-    u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8);
+    u64a probe = *CU64A_P_CAST(ROUNDUP_PTR(buf+i, 8));
 #else
    u32 cmpVal = c;
    cmpVal |= cmpVal << 8;
@ -139,16 +153,16 @@ const u8 * floodDetect(const struct FDR * fdr,
 #ifndef FLOOD_32
    j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs
    for (; j + 32 < mainLoopLen; j += 32) {
-        u64a v = *(const u64a *)(buf + j);
-        u64a v2 = *(const u64a *)(buf + j + 8);
-        u64a v3 = *(const u64a *)(buf + j + 16);
-        u64a v4 = *(const u64a *)(buf + j + 24);
+        u64a v = *CU64A_P_CAST(buf + j);
+        u64a v2 = *CU64A_P_CAST(buf + j + 8);
+        u64a v3 = *CU64A_P_CAST(buf + j + 16);
+        u64a v4 = *CU64A_P_CAST(buf + j + 24);
        if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) {
            break;
        }
    }
    for (; j + 8 < mainLoopLen; j += 8) {
-        u64a v = *(const u64a *)(buf + j);
+        u64a v = *CU64A_P_CAST(buf + j);
        if (v != cmpVal) {
            break;
        }
@ -172,7 +186,11 @@ const u8 * floodDetect(const struct FDR * fdr,
    }
 #endif
    for (; j < mainLoopLen; j++) {
+#ifdef __cplusplus
+        u8 v = *(reinterpret_cast<const u8 *>(buf + j));
+#else
        u8 v = *(const u8 *)(buf + j);
+#endif
        if (v != c) {
            break;
        }
--- a/src/fdr/teddy.c
+++ b/src/fdr/teddy.c
--- a/src/fdr/teddy.cpp
+++ b/src/fdr/teddy.cpp
@ -0,0 +1,862 @@
+/*
+ * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Teddy literal matcher: SSSE3 engine runtime.
+ */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/arch.h"
+#include "util/simd_utils.h"
+
+
+#ifdef ARCH_64_BIT
+static really_inline
+hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u64a)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+ if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+#else // 32/64
+
+static really_inline
+hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u32)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+ if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+#endif
+
+#if defined(HAVE_AVX512VBMI) || defined(HAVE_AVX512) // common to both 512b's
+
+/* Returns the address that follows the ROUNDUP_CL-padded Teddy header and the
+ * ROUNDUP_CL-padded block of 2 * numMask m256 entries, viewed as m512. */
+static really_inline
+const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
+    const u8 *base = reinterpret_cast<const u8 *>(teddy);
+    const size_t hdr_len = ROUNDUP_CL(sizeof(struct Teddy));
+    const size_t msk_len = ROUNDUP_CL(2 * numMask * sizeof(m256));
+    return reinterpret_cast<const m512 *>(base + hdr_len + msk_len);
+}
+
+
+#ifdef ARCH_64_BIT
+
+/**
+ * \brief Confirm a 512-bit Teddy result vector as eight 64-bit chunks.
+ *
+ * Nothing is done when diff512() reports no difference from ones512().
+ * Otherwise the vector is split into eight 64-bit chunks that are confirmed
+ * in order, with the byte offset advancing by 8 per chunk. CONF_CHUNK_64
+ * returns HWLM_TERMINATED from this function if matching is terminated.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_64_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (!unlikely(diff512(var, ones512()))) {
+        return HWLM_SUCCESS;
+    }
+
+    // split into four 128-bit lanes, then into low/high 64-bit halves
+    const m128 lane0 = extract128from512(var, 0);
+    const m128 lane1 = extract128from512(var, 1);
+    const m128 lane2 = extract128from512(var, 2);
+    const m128 lane3 = extract128from512(var, 3);
+
+    const u64a q0 = movq(lane0);
+    const u64a q1 = movq(rshiftbyte_m128(lane0, 8));
+    const u64a q2 = movq(lane1);
+    const u64a q3 = movq(rshiftbyte_m128(lane1, 8));
+    const u64a q4 = movq(lane2);
+    const u64a q5 = movq(rshiftbyte_m128(lane2, 8));
+    const u64a q6 = movq(lane3);
+    const u64a q7 = movq(rshiftbyte_m128(lane3, 8));
+
+    CONF_CHUNK_64(q0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q1, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q2, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q3, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q4, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q5, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q6, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(q7, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
+
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_512_f confirm_teddy_64_512
+
+#else // 32/64
+
+/**
+ * \brief Confirm a 512-bit Teddy result vector as sixteen 32-bit chunks.
+ *
+ * Nothing is done when diff512() reports no difference from ones512().
+ * Otherwise the vector is split into sixteen 32-bit chunks that are confirmed
+ * in order, with the byte offset advancing by 4 per chunk. CONF_CHUNK_32
+ * returns HWLM_TERMINATED from this function if matching is terminated.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_32_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (!unlikely(diff512(var, ones512()))) {
+        return HWLM_SUCCESS;
+    }
+
+    // split into four 128-bit lanes, then into four 32-bit words per lane
+    const m128 lane0 = extract128from512(var, 0);
+    const m128 lane1 = extract128from512(var, 1);
+    const m128 lane2 = extract128from512(var, 2);
+    const m128 lane3 = extract128from512(var, 3);
+
+    const u32 w0 = movd(lane0);
+    const u32 w1 = movd(rshiftbyte_m128(lane0, 4));
+    const u32 w2 = movd(rshiftbyte_m128(lane0, 8));
+    const u32 w3 = movd(rshiftbyte_m128(lane0, 12));
+    const u32 w4 = movd(lane1);
+    const u32 w5 = movd(rshiftbyte_m128(lane1, 4));
+    const u32 w6 = movd(rshiftbyte_m128(lane1, 8));
+    const u32 w7 = movd(rshiftbyte_m128(lane1, 12));
+    const u32 w8 = movd(lane2);
+    const u32 w9 = movd(rshiftbyte_m128(lane2, 4));
+    const u32 w10 = movd(rshiftbyte_m128(lane2, 8));
+    const u32 w11 = movd(rshiftbyte_m128(lane2, 12));
+    const u32 w12 = movd(lane3);
+    const u32 w13 = movd(rshiftbyte_m128(lane3, 4));
+    const u32 w14 = movd(rshiftbyte_m128(lane3, 8));
+    const u32 w15 = movd(rshiftbyte_m128(lane3, 12));
+
+    CONF_CHUNK_32(w0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w1, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w3, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w4, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w5, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w6, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w7, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w8, bucket, offset + 32, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w9, bucket, offset + 36, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w10, bucket, offset + 40, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w11, bucket, offset + 44, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w12, bucket, offset + 48, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w13, bucket, offset + 52, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w14, bucket, offset + 56, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(w15, bucket, offset + 60, reason, ptr, confBase, a, control, last_match);
+
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_512_f confirm_teddy_32_512
+
+
+#endif // 32/64
+
+#define CONFIRM_TEDDY_512(...) if(confirm_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+#endif // AVX512VBMI or AVX512
+
+
+#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
+
+/* 64-bit masks passed to maskz_vpermb512() below: the low 1, 2 or 3 bits are
+ * clear and all other bits are set. */
+#define TEDDY_VBMI_SL1_MASK   0xfffffffffffffffeULL
+#define TEDDY_VBMI_SL2_MASK   0xfffffffffffffffcULL
+#define TEDDY_VBMI_SL3_MASK   0xfffffffffffffff8ULL
+
+/**
+ * \brief Build the Teddy result vector for one 512-bit block (VBMI variant).
+ *
+ * val is split into low and high nibbles (using *lo_mask). For each of the
+ * NMSK masks the pair dup_mask[2 * i] / dup_mask[2 * i + 1] is shuffled by
+ * the low / high nibbles and OR-ed together. Results for masks 1..3 are
+ * additionally permuted through sl_msk[i - 1] with maskz_vpermb512() before
+ * everything is OR-ed into the returned vector.
+ *
+ * Only the first NMSK mask pairs and the first NMSK - 1 sl_msk entries are
+ * used at runtime; each `if constexpr` returns before later entries are read.
+ */
+template<int NMSK>
+static really_inline
+m512 prep_conf_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
+                                   const m512 *sl_msk, const m512 val) {
+    m512 lo = and512(val, *lo_mask);
+    m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
+    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
+                            pshufb_m512(dup_mask[1], hi));
+
+    if constexpr (NMSK == 1) return shuf_or_b0;
+    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
+                            pshufb_m512(dup_mask[3], hi));
+    m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
+    if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
+    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
+                            pshufb_m512(dup_mask[5], hi));
+    m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
+    if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
+    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
+                            pshufb_m512(dup_mask[7], hi));
+    m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
+    return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
+}
+
+
+#define TEDDY_VBMI_SL1_POS    15
+#define TEDDY_VBMI_SL2_POS    14
+#define TEDDY_VBMI_SL3_POS    13
+
+/* 64-bit lane masks for the VBMI scan loop. They expand in terms of the
+ * locals `n_sh` and `overlap`, which must be in scope at the point of use. */
+#define TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffffffffffULL >> n_sh)
+#define TEDDY_VBMI_CONF_MASK_FULL   (0xffffffffffffffffULL << n_sh)
+/* n must be in [1, 64] so that the shift count stays below 64. */
+#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - (n)) << overlap)
+/* Low n_sh bits set. n_sh == 0 (single-mask Teddy) is handled explicitly:
+ * shifting a 64-bit value by 64 is undefined behaviour, and the intended
+ * result in that case is an empty mask. */
+#define TEDDY_VBMI_LOAD_MASK_PATCH  (n_sh ? (0xffffffffffffffffULL >> (64 - n_sh)) : 0ULL)
+
+/**
+ * \brief AVX512VBMI Teddy scan loop, templated on the number of masks.
+ *
+ * Walks a->buf from a->start_offset to a->len in strides of
+ * loopBytes = 64 - (NMSK - 1) bytes. A result vector is built for each
+ * stride with prep_conf_teddy_512vbmi_templ() and handed to
+ * CONFIRM_TEDDY_512, which returns HWLM_TERMINATED from this function when
+ * matching has been terminated.
+ *
+ * \param fdr     engine bytecode; reinterpreted as a Teddy structure.
+ * \param a       runtime arguments (buffer, length, start offset, ...).
+ * \param control group control word; its address is passed to confirmation.
+ * \return HWLM_SUCCESS once the buffer is exhausted, or HWLM_TERMINATED.
+ */
+template<int NMSK>
+hwlm_error_t fdr_exec_teddy_512vbmi_templ(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    // floodBackoff, tryFloodDetect and iterBytes are not referenced directly
+    // below; presumably they are consumed by CHECK_FLOOD - TODO confirm.
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64;
+    u32 n_sh = NMSK - 1;
+    const size_t loopBytes = 64 - n_sh;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+
+    // Broadcast each 128-bit nibble mask across the four lanes of an m512.
+    m512 lo_mask = set1_64x8(0xf);
+    m512 dup_mask[NMSK * 2];
+    // NOTE(review): for NMSK == 1 this is a zero-length array, which relies
+    // on a compiler extension; it is never indexed in that instantiation.
+    m512 sl_msk[NMSK - 1];
+    dup_mask[0] = set1_4x128(maskBase[0]);
+    dup_mask[1] = set1_4x128(maskBase[1]);
+    if constexpr (NMSK > 1){
+    dup_mask[2] = set1_4x128(maskBase[2]);
+    dup_mask[3] = set1_4x128(maskBase[3]);
+    sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS);
+    }
+    if constexpr (NMSK > 2){
+    dup_mask[4] = set1_4x128(maskBase[4]);
+    dup_mask[5] = set1_4x128(maskBase[5]);
+    sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS);
+    }
+    if constexpr (NMSK > 3){
+    dup_mask[6] = set1_4x128(maskBase[6]);
+    dup_mask[7] = set1_4x128(maskBase[7]);
+    sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    u64a k = TEDDY_VBMI_CONF_MASK_FULL;
+    m512 p_mask = set_mask_m512(~k);
+    u32 overlap = 0;
+    u64a patch = 0;
+    // Head: the first stride is loaded from ptr itself. Once it has been
+    // processed, later loads start n_sh bytes before ptr (overlap) and the
+    // tail load mask is extended by `patch`.
+    if (likely(ptr + loopBytes <= buf_end)) {
+        m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD);
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr));
+        r_0 = or512(r_0, p_mask0);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += loopBytes;
+        overlap = n_sh;
+        patch = TEDDY_VBMI_LOAD_MASK_PATCH;
+    }
+
+    // Main loop: full 64-byte loads starting n_sh bytes before ptr; p_mask is
+    // OR-ed into the result before confirmation.
+    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
+        __builtin_prefetch(ptr - n_sh + (64 * 2));
+        CHECK_FLOOD;
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, loadu512(ptr - n_sh));
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh);
+    }
+
+    // Tail: fewer than loopBytes bytes remain, so a masked load with mask
+    // (k1 | patch) is used instead of a full 64-byte load.
+    assert(ptr + loopBytes > buf_end);
+    if (ptr < buf_end) {
+        u32 left = (u32)(buf_end - ptr);
+        u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left);
+        m512 p_mask1 = set_mask_m512(~k1);
+        m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap);
+        m512 r_0 = prep_conf_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
+        r_0 = or512(r_0, p_mask1);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr - overlap);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512vbmi_templ
+
+#elif defined(HAVE_AVX512) // AVX512 reinforced teddy
+
+/* both 512b versions use the same confirm teddy */
+
+/* Combine the per-mask shuffle results for masks 0..NMSK-1: the result for
+ * mask i is passed through lshift128_m512(..., i) and everything is OR-ed
+ * together. Recursion over NMSK ends at the <1> specialisation below. */
+template <int NMSK>
+static inline
+m512 shift_or_512_templ(const m512 *dup_mask, m512 lo, m512 hi) {
+    const m512 lo_hits = pshufb_m512(dup_mask[(NMSK - 1) * 2], lo);
+    const m512 hi_hits = pshufb_m512(dup_mask[(NMSK * 2) - 1], hi);
+    const m512 shifted = lshift128_m512(or512(lo_hits, hi_hits), NMSK - 1);
+    return or512(shifted, shift_or_512_templ<NMSK - 1>(dup_mask, lo, hi));
+}
+
+/* Base case: the first mask pair is not shifted. */
+template <>
+m512 shift_or_512_templ<1>(const m512 *dup_mask, m512 lo, m512 hi){
+    const m512 lo_hits = pshufb_m512(dup_mask[0], lo);
+    const m512 hi_hits = pshufb_m512(dup_mask[1], hi);
+    return or512(lo_hits, hi_hits);
+}
+
+/* Build the Teddy result vector for an already-loaded 512-bit value: split it
+ * into low/high nibbles and combine the mask lookups, with no reinforcement
+ * mask applied. */
+template <int NMSK>
+static really_inline
+m512 prep_conf_teddy_no_reinforcement_512_templ(const m512 *lo_mask,
+                                                const m512 *dup_mask,
+                                                const m512 val) {
+    const m512 nib_lo = and512(val, *lo_mask);
+    const m512 nib_hi = and512(rshift64_m512(val, 4), *lo_mask);
+    return shift_or_512_templ<NMSK>(dup_mask, nib_lo, nib_hi);
+}
+
+/**
+ * \brief Build the Teddy result vector for the 64 bytes at ptr, including the
+ *        reinforcement mask.
+ *
+ * The reinforcement mask is assembled from r_msk_base entries indexed by the
+ * bytes at ptr + 15, ptr + 31 and ptr + 47 of this block, plus the value
+ * that *c_0 held on entry. On return *c_16, *c_32 and *c_48 hold those three
+ * bytes and *c_0 holds the byte at ptr + 63.
+ */
+template <int NMSK>
+static really_inline
+m512 prep_conf_teddy_512_templ(const m512 *lo_mask, const m512 *dup_mask,
+                               const u8 *ptr, const u64a *r_msk_base,
+                               u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) {
+    const m512 data = load512(ptr);
+    const m512 nib_lo = and512(data, *lo_mask);
+    const m512 nib_hi = and512(rshift64_m512(data, 4), *lo_mask);
+
+    *c_16 = *(ptr + 15);
+    *c_32 = *(ptr + 31);
+    *c_48 = *(ptr + 47);
+    // *c_0 must be read here, before it is replaced with this block's last byte
+    const m512 r_msk = set8x64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],
+                               0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);
+    *c_0 = *(ptr + 63);
+
+    return or512(shift_or_512_templ<NMSK>(dup_mask, nib_lo, nib_hi), r_msk);
+}
+
+
+/* Shorthand for prep_conf_teddy_512_templ<n>; expands in terms of the locals
+ * lo_mask, dup_mask, r_msk_base, c_0, c_16, c_32 and c_48. */
+#define PREP_CONF_FN_512(ptr, n)                                                  \
+    prep_conf_teddy_512_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base,             \
+                         &c_0, &c_16, &c_32, &c_48)
+
+/**
+ * \brief AVX512 (reinforced) Teddy scan loop, templated on the number of
+ *        masks.
+ *
+ * Processes a->buf from a->start_offset to a->len in 64-byte blocks whose
+ * addresses are multiples of 64. A partial block at the start and any bytes
+ * left at the end are loaded through vectoredLoad512() and processed without
+ * reinforcement; all full blocks go through PREP_CONF_FN_512. Every result
+ * vector is handed to CONFIRM_TEDDY_512, which returns HWLM_TERMINATED from
+ * this function when matching has been terminated.
+ *
+ * \param fdr     engine bytecode; reinterpreted as a Teddy structure.
+ * \param a       runtime arguments (buffer, history, length, start offset).
+ * \param control group control word; its address is passed to confirmation.
+ * \return HWLM_SUCCESS once the buffer is exhausted, or HWLM_TERMINATED.
+ */
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_512_templ(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    // floodBackoff and tryFloodDetect are not referenced directly below;
+    // presumably they are consumed by CHECK_FLOOD - TODO confirm.
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 128;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+
+    // Broadcast each 128-bit nibble mask across the four lanes of an m512.
+    m512 lo_mask = set1_64x8(0xf);
+    m512 dup_mask[NMSK * 2];
+
+    dup_mask[0] = set1_4x128(maskBase[0]);
+    dup_mask[1] = set1_4x128(maskBase[1]);
+    if constexpr (NMSK > 1){
+    dup_mask[2] = set1_4x128(maskBase[2]);
+    dup_mask[3] = set1_4x128(maskBase[3]);
+    }
+    if constexpr (NMSK > 2){
+    dup_mask[4] = set1_4x128(maskBase[4]);
+    dup_mask[5] = set1_4x128(maskBase[5]);
+    }
+    if constexpr (NMSK > 3){
+    dup_mask[6] = set1_4x128(maskBase[6]);
+    dup_mask[7] = set1_4x128(maskBase[7]);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    // Indices into r_msk_base, updated by PREP_CONF_FN_512 from the bytes of
+    // each processed block. 0x100 is the starting index (presumably a
+    // "no previous byte" entry - TODO confirm against the compile side).
+    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
+    u32 c_0 = 0x100;
+    u32 c_16 = 0x100;
+    u32 c_32 = 0x100;
+    u32 c_48 = 0x100;
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 64);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    // Head: when ptr is not at a 64-byte boundary, step back to the start of
+    // its block and use a vectored load; p_mask is OR-ed into the result.
+    if (ptr < mainStart) {
+        ptr = mainStart - 64;
+        m512 p_mask;
+        m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask, val_0);
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += 64;
+    }
+
+    // First full block: confirmed with VECTORING rather than NOT_CAUTIOUS.
+    if (ptr + 64 <= buf_end) {
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+        ptr += 64;
+    }
+
+    // Main loop: two 64-byte blocks per iteration (iterBytes == 128).
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m512 r_1 = PREP_CONF_FN_512(ptr + 64, NMSK);
+        CONFIRM_TEDDY_512(r_1, 8, 64, NOT_CAUTIOUS, ptr);
+    }
+
+    // At most one full 64-byte block can remain after the main loop.
+    if (ptr + 64 <= buf_end) {
+        m512 r_0 = PREP_CONF_FN_512(ptr, NMSK);
+        CONFIRM_TEDDY_512(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 64;
+    }
+
+    // Tail: fewer than 64 bytes remain; use a vectored load bounded by
+    // buf_end.
+    assert(ptr + 64 > buf_end);
+    if (ptr < buf_end) {
+        m512 p_mask;
+        m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m512 r_0 = prep_conf_teddy_no_reinforcement_512_templ<NMSK>(&lo_mask, dup_mask,val_0);
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_TEDDY_512(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_512_templ
+
+/* #endif // AVX512 vs AVX512VBMI * back to the original fully exclusive logic */
+
+#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
+
+#ifdef ARCH_64_BIT
+
+/**
+ * \brief Confirm a 256-bit Teddy result vector as four 64-bit chunks.
+ *
+ * Nothing is done when diff256() reports no difference from ones256().
+ * Otherwise the vector is split into four 64-bit chunks that are confirmed in
+ * order, with the byte offset advancing by 8 per chunk. CONF_CHUNK_64 returns
+ * HWLM_TERMINATED from this function if matching is terminated.
+ *
+ * Declared static really_inline, matching confirm_teddy_64_512: this gives
+ * the helper internal linkage and keeps it inlined into the scan loop.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_64_256(m256 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m128 lo = movdq_lo(var);
+        m128 hi = movdq_hi(var);
+        u64a part1 = movq(lo);
+        u64a part2 = movq(rshiftbyte_m128(lo, 8));
+        u64a part3 = movq(hi);
+        u64a part4 = movq(rshiftbyte_m128(hi, 8));
+        CONF_CHUNK_64(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part3, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_64(part4, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_256_f confirm_teddy_64_256
+
+#else
+
+/**
+ * \brief Confirm a 256-bit Teddy result vector as eight 32-bit chunks.
+ *
+ * Nothing is done when diff256() reports no difference from ones256().
+ * Otherwise the vector is split into eight 32-bit chunks that are confirmed
+ * in order, with the byte offset advancing by 4 per chunk. CONF_CHUNK_32
+ * returns HWLM_TERMINATED from this function if matching is terminated.
+ *
+ * Declared static really_inline, matching confirm_teddy_32_512: this gives
+ * the helper internal linkage and keeps it inlined into the scan loop.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_32_256(m256 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (unlikely(diff256(var, ones256()))) {
+        m128 lo = movdq_lo(var);
+        m128 hi = movdq_hi(var);
+        u32 part1 = movd(lo);
+        u32 part2 = movd(rshiftbyte_m128(lo, 4));
+        u32 part3 = movd(rshiftbyte_m128(lo, 8));
+        u32 part4 = movd(rshiftbyte_m128(lo, 12));
+        u32 part5 = movd(hi);
+        u32 part6 = movd(rshiftbyte_m128(hi, 4));
+        u32 part7 = movd(rshiftbyte_m128(hi, 8));
+        u32 part8 = movd(rshiftbyte_m128(hi, 12));
+        CONF_CHUNK_32(part1, bucket, offset, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part3, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part4, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part5, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part6, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part7, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+        CONF_CHUNK_32(part8, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    }
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_256_f confirm_teddy_32_256
+
+#endif
+
+#define CONFIRM_TEDDY_256(...) if(confirm_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+/*
+static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                       const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 p_mask128;
+    m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
+                                          buf_history, len_history, nMasks));
+    *p_mask = set1_2x128(p_mask128);
+    return ret;
+}
+*/
+
+/* Combine the per-mask shuffle results for masks 0..NMSK-1: the result for
+ * mask i is passed through lshift128_m256(..., i) and everything is OR-ed
+ * together. Recursion over NMSK ends at the <1> specialisation below. */
+template <int NMSK>
+static inline
+m256 shift_or_256_templ(const m256 *dup_mask, m256 lo, m256 hi){
+    const m256 lo_hits = pshufb_m256(dup_mask[(NMSK-1)*2], lo);
+    const m256 hi_hits = pshufb_m256(dup_mask[(NMSK*2)-1], hi);
+    const m256 shifted = lshift128_m256(or256(lo_hits, hi_hits), (NMSK-1));
+    return or256(shifted, shift_or_256_templ<NMSK-1>(dup_mask, lo, hi));
+}
+
+/* Base case: the first mask pair is not shifted. */
+template<>
+m256 shift_or_256_templ<1>(const m256 *dup_mask, m256 lo, m256 hi){
+    const m256 lo_hits = pshufb_m256(dup_mask[0], lo);
+    const m256 hi_hits = pshufb_m256(dup_mask[1], hi);
+    return or256(lo_hits, hi_hits);
+}
+
+/* Build the Teddy result vector for an already-loaded 256-bit value: split it
+ * into low/high nibbles and combine the mask lookups, with no reinforcement
+ * mask applied. */
+template <int NMSK>
+static really_inline
+m256 prep_conf_teddy_no_reinforcement_256_templ(const m256 *lo_mask,
+                                         const m256 *dup_mask,
+                                         const m256 val) {
+    const m256 nib_lo = and256(val, *lo_mask);
+    const m256 nib_hi = and256(rshift64_m256(val, 4), *lo_mask);
+    return shift_or_256_templ<NMSK>(dup_mask, nib_lo, nib_hi);
+}
+
+/**
+ * \brief Build the Teddy result vector for the 32 bytes at ptr, including the
+ *        reinforcement mask.
+ *
+ * The reinforcement mask is assembled from r_msk_base entries indexed by the
+ * byte at ptr + 15 of this block and by the value that *c_0 held on entry.
+ * On return *c_128 holds the byte at ptr + 15 and *c_0 the byte at ptr + 31.
+ */
+template <int NMSK>
+static really_inline
+m256 prep_conf_teddy_256_templ(const m256 *lo_mask, const m256 *dup_mask,
+                        const u8 *ptr, const u64a *r_msk_base,
+                        u32 *c_0, u32 *c_128) {
+    const m256 data = load256(ptr);
+    const m256 nib_lo = and256(data, *lo_mask);
+    const m256 nib_hi = and256(rshift64_m256(data, 4), *lo_mask);
+
+    *c_128 = *(ptr + 15);
+    // *c_0 must be read here, before it is replaced with this block's last byte
+    const m256 r_msk = set4x64(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]);
+    *c_0 = *(ptr + 31);
+
+    return or256(shift_or_256_templ<NMSK>(dup_mask, nib_lo, nib_hi), r_msk);
+}
+
+/* Shorthands for the two prep functions; both expand in terms of the locals
+ * lo_mask and dup_mask, and PREP_CONF_FN_256 additionally uses r_msk_base,
+ * c_0 and c_128. */
+#define PREP_CONF_FN_256_NO_REINFORCEMENT(val, n)                                 \
+    prep_conf_teddy_no_reinforcement_256_templ<n>(&lo_mask, dup_mask, val)
+
+#define PREP_CONF_FN_256(ptr, n)                                                  \
+    prep_conf_teddy_256_templ<n>(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128)
+
+/**
+ * \brief AVX2 (reinforced) Teddy scan loop, templated on the number of masks.
+ *
+ * Processes a->buf from a->start_offset to a->len in 32-byte blocks whose
+ * addresses are multiples of 32. A partial block at the start and any bytes
+ * left at the end are loaded through vectoredLoad256() and processed without
+ * reinforcement; all full blocks go through PREP_CONF_FN_256. Every result
+ * vector is handed to CONFIRM_TEDDY_256, which returns HWLM_TERMINATED from
+ * this function when matching has been terminated.
+ *
+ * \param fdr     engine bytecode; reinterpreted as a Teddy structure.
+ * \param a       runtime arguments (buffer, history, length, start offset).
+ * \param control group control word; its address is passed to confirmation.
+ * \return HWLM_SUCCESS once the buffer is exhausted, or HWLM_TERMINATED.
+ */
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_256_templ(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    // floodBackoff and tryFloodDetect are not referenced directly below;
+    // presumably they are consumed by CHECK_FLOOD - TODO confirm.
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 64;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    //PREPARE_MASKS_256;
+
+    // Broadcast each 128-bit nibble mask across both lanes of an m256.
+    m256 lo_mask = set1_32x8(0xf);
+    m256 dup_mask[NMSK * 2];
+    dup_mask[0] = set1_2x128(maskBase[0]);
+    dup_mask[1] = set1_2x128(maskBase[1]);
+    if constexpr (NMSK > 1){
+    dup_mask[2] = set1_2x128(maskBase[2]);
+    dup_mask[3] = set1_2x128(maskBase[3]);
+    }
+    if constexpr (NMSK > 2){
+    dup_mask[4] = set1_2x128(maskBase[4]);
+    dup_mask[5] = set1_2x128(maskBase[5]);
+    }
+    if constexpr (NMSK > 3){
+    dup_mask[6] = set1_2x128(maskBase[6]);
+    dup_mask[7] = set1_2x128(maskBase[7]);
+    }
+    const u32 *confBase = getConfBase(teddy);
+
+    // Indices into r_msk_base, updated by PREP_CONF_FN_256 from the bytes of
+    // each processed block. 0x100 is the starting index (presumably a
+    // "no previous byte" entry - TODO confirm against the compile side).
+    const u64a *r_msk_base = getReinforcedMaskBase(teddy, NMSK);
+    u32 c_0 = 0x100;
+    u32 c_128 = 0x100;
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 32);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    // Head: when ptr is not at a 32-byte boundary, step back to the start of
+    // its block and use a vectored load; p_mask is OR-ed into the result.
+    if (ptr < mainStart) {
+        ptr = mainStart - 32;
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+        ptr += 32;
+    }
+
+    // First full block: confirmed with VECTORING rather than NOT_CAUTIOUS.
+    if (ptr + 32 <= buf_end) {
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+        ptr += 32;
+    }
+
+    // Main loop: two 32-byte blocks per iteration (iterBytes == 64).
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m256 r_1 = PREP_CONF_FN_256(ptr + 32, NMSK);
+        CONFIRM_TEDDY_256(r_1, 8, 32, NOT_CAUTIOUS, ptr);
+    }
+
+    // At most one full 32-byte block can remain after the main loop.
+    if (ptr + 32 <= buf_end) {
+        m256 r_0 = PREP_CONF_FN_256(ptr, NMSK);
+        CONFIRM_TEDDY_256(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 32;
+    }
+
+    // Tail: fewer than 32 bytes remain; use a vectored load bounded by
+    // buf_end.
+    assert(ptr + 32 > buf_end);
+    if (ptr < buf_end) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m256 r_0 = PREP_CONF_FN_256_NO_REINFORCEMENT(val_0, NMSK);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_TEDDY_256(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_256_templ
+
+#else // not defined HAVE_AVX2
+
+#ifdef ARCH_64_BIT
+/**
+ * \brief Confirm stage for one 128-bit Teddy result vector (64-bit build).
+ *
+ * Returns immediately when diff128() reports no difference between \p var and
+ * ones128(). Otherwise the vector is stored to a 16-byte aligned scratch
+ * array and each 64-bit half is passed to CONF_CHUNK_64, the upper half with
+ * \p offset advanced by 8.
+ *
+ * \return HWLM_SUCCESS when control reaches the end of the function.
+ *         NOTE(review): CONF_CHUNK_64 is a macro defined elsewhere and may
+ *         leave this function early -- confirm against its definition.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_64_128(m128 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (!unlikely(diff128(var, ones128()))) {
+        return HWLM_SUCCESS;
+    }
+
+    u64a __attribute__((aligned(16))) halves[2];
+    store128(halves, var);
+    u64a low_half = halves[0];
+    u64a high_half = halves[1];
+    CONF_CHUNK_64(low_half, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_64(high_half, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_128_f confirm_teddy_64_128
+
+#else // 32/64
+
+/**
+ * \brief Confirm stage for one 128-bit Teddy result vector (32-bit build).
+ *
+ * Returns immediately when diff128() reports no difference between \p var and
+ * ones128(). Otherwise the four 32-bit words of the vector are extracted with
+ * movd() after byte shifts of 0, 4, 8 and 12, and each word is passed to
+ * CONF_CHUNK_32 with \p offset advanced by the same amount.
+ *
+ * \return HWLM_SUCCESS when control reaches the end of the function.
+ *         NOTE(review): CONF_CHUNK_32 is a macro defined elsewhere and may
+ *         leave this function early -- confirm against its definition.
+ */
+static really_inline
+hwlm_error_t confirm_teddy_32_128(m128 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    if (!unlikely(diff128(var, ones128()))) {
+        return HWLM_SUCCESS;
+    }
+
+    u32 word0 = movd(var);
+    u32 word1 = movd(rshiftbyte_m128(var, 4));
+    u32 word2 = movd(rshiftbyte_m128(var, 8));
+    u32 word3 = movd(rshiftbyte_m128(var, 12));
+    CONF_CHUNK_32(word0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(word1, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(word2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_CHUNK_32(word3, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+
+#define confirm_teddy_128_f confirm_teddy_32_128
+
+#endif  // 32/64
+
+
+#define CONFIRM_TEDDY_128(...) if(confirm_teddy_128_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+/**
+ * \brief Compute the Teddy candidate vector for one 16-byte block.
+ *
+ * \p val is split into a low-nibble and a high-nibble vector (AND with 0xf,
+ * before and after a right shift by 4). For mask i the two nibble vectors
+ * index the tables maskBase[2 * i] and maskBase[2 * i + 1] through
+ * pshufb_m128 and the two lookups are OR-ed. For i > 0 that per-mask result
+ * goes through palignr against an all-zero vector with offset 16 - i before
+ * it is OR-ed into the accumulated value.
+ *
+ * \tparam NMSK     number of masks to combine: 1, 2 and 3 stop after that
+ *                  many, any other value combines four.
+ * \param maskBase  nibble lookup tables, two consecutive m128 per mask.
+ * \param val       the input block.
+ * \return the OR of all per-mask results.
+ */
+template <int NMSK>
+static really_inline
+m128 prep_conf_teddy_128_templ(const m128 *maskBase, m128 val) {
+    m128 nibble_mask = set1_16x8(0xf);
+    m128 lo = and128(val, nibble_mask);
+    m128 hi = and128(rshift64_m128(val, 4), nibble_mask);
+
+    m128 acc = or128(pshufb_m128(maskBase[0], lo),
+                     pshufb_m128(maskBase[1], hi));
+    if constexpr (NMSK != 1) {
+        m128 zero = zeroes128();
+        m128 hit_1 = or128(pshufb_m128(maskBase[2], lo),
+                           pshufb_m128(maskBase[3], hi));
+        acc = or128(acc, palignr(hit_1, zero, 16 - 1));
+        if constexpr (NMSK != 2) {
+            m128 hit_2 = or128(pshufb_m128(maskBase[4], lo),
+                               pshufb_m128(maskBase[5], hi));
+            acc = or128(acc, palignr(hit_2, zero, 16 - 2));
+            if constexpr (NMSK != 3) {
+                m128 hit_3 = or128(pshufb_m128(maskBase[6], lo),
+                                   pshufb_m128(maskBase[7], hi));
+                acc = or128(acc, palignr(hit_3, zero, 16 - 3));
+            }
+        }
+    }
+    return acc;
+}
+/**
+ * \brief Teddy literal scan over a->buf using 128-bit vectors.
+ *
+ * The buffer is consumed in 16-byte blocks: a vectoredLoad128 block when the
+ * start position is below mainStart, one full block confirmed as VECTORING,
+ * a main loop of two blocks (iterBytes = 32) per iteration confirmed as
+ * NOT_CAUTIOUS, at most one further full block, and a vectoredLoad128 block
+ * for any remaining tail bytes.
+ *
+ * CONFIRM_TEDDY_128 reads the locals a, confBase, control and last_match by
+ * name and returns HWLM_TERMINATED from this function when the confirm
+ * helper reports it. CHECK_FLOOD is a macro defined elsewhere; floodBackoff
+ * and tryFloodDetect are presumably consumed by it (TODO confirm).
+ *
+ * \tparam NMSK number of Teddy masks; instantiated with 1 to 4 in this file.
+ * \return HWLM_SUCCESS when the end of the function is reached.
+ */
+template <int NMSK>
+hwlm_error_t fdr_exec_teddy_128_templ(const struct FDR *fdr,
+                             const struct FDR_Runtime_Args *a,
+                             hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32; // passed as &last_match by CONFIRM_TEDDY_128
+    const struct Teddy *teddy = reinterpret_cast<const struct Teddy *>(fdr);
+    const size_t iterBytes = 32; // two 16-byte blocks per main-loop iteration
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m128 *maskBase = getMaskBase(teddy);
+    const u32 *confBase = getConfBase(teddy); // passed on by CONFIRM_TEDDY_128
+    /* Start below mainStart (assuming ROUNDUP_PTR rounds up to a multiple of
+     * 16, i.e. an unaligned start): scan the block ending at mainStart through
+     * vectoredLoad128 and OR its p_mask output into the result. */
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16;
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset,
+                                     a->buf, buf_end,
+                                     a->buf_history, a->len_history, NMSK);
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
+        r_0 = or128(r_0, p_mask);
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+    // One full block, still confirmed with the VECTORING caution reason.
+    if (ptr + 16 <= buf_end) {
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+    // Main loop: two blocks per iteration, confirmed as NOT_CAUTIOUS.
+    for (; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4)); // four iterations ahead
+        CHECK_FLOOD;
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        m128 r_1 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr + 16));
+        CONFIRM_TEDDY_128(r_1, 8, 16, NOT_CAUTIOUS, ptr); // offset 16: second block
+    }
+    // The loop can leave at most one full 16-byte block behind.
+    if (ptr + 16 <= buf_end) {
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, load128(ptr));
+        CONFIRM_TEDDY_128(r_0, 8, 0, NOT_CAUTIOUS, ptr);
+        ptr += 16;
+    }
+    // Tail: fewer than 16 bytes remain (asserted on the next line).
+    assert(ptr + 16 > buf_end);
+    if (ptr < buf_end) {
+        m128 p_mask;
+        m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end,
+                                     a->buf_history, a->len_history, NMSK); // ptr is the buffer start here, start offset 0
+        m128 r_0 = prep_conf_teddy_128_templ<NMSK>(maskBase, val_0);
+        r_0 = or128(r_0, p_mask);
+        CONFIRM_TEDDY_128(r_0, 8, 0, VECTORING, ptr);
+    }
+
+    return HWLM_SUCCESS;
+}
+// Scan template selected for this branch of the preprocessor conditional.
+#define FDR_EXEC_TEDDY_FN fdr_exec_teddy_128_templ
+
+
+#endif // HAVE_AVX2 HAVE_AVX512
+
+
+// C-linkage entry points for the Teddy scan templates.
+extern "C" {
+/*
+ * One pair of entry points per mask count N (1 to 4). Each forwards its
+ * arguments unchanged to FDR_EXEC_TEDDY_FN<N> and returns its result. The
+ * plain and the _pck entry point for a given N have identical bodies here.
+ */
+hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<1>(fdr, a, control); // same template as fdr_exec_teddy_msks1
+}
+
+hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<2>(fdr, a, control); // same template as fdr_exec_teddy_msks2
+}
+
+hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<3>(fdr, a, control); // same template as fdr_exec_teddy_msks3
+}
+
+hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr,
+                                  const struct FDR_Runtime_Args *a,
+                                  hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_TEDDY_FN<4>(fdr, a, control); // same template as fdr_exec_teddy_msks4
+}
+
+} // extern "C"
+
--- a/src/fdr/teddy.h
+++ b/src/fdr/teddy.h
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -39,6 +40,10 @@
 struct FDR; // forward declaration from fdr_internal.h
 struct FDR_Runtime_Args;

+#ifdef __cplusplus
+extern "C" { /* give the declarations that follow C linkage when compiled as C++ */
+#endif /* __cplusplus */
+
 hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr,
                                  const struct FDR_Runtime_Args *a,
                                  hwlm_group_t control);
@ -106,5 +111,8 @@ hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
                                          hwlm_group_t control);

 #endif /* HAVE_AVX2 */
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */

 #endif /* TEDDY_H_ */
--- a/src/fdr/teddy_avx2.c
+++ b/src/fdr/teddy_avx2.c
@ -1,709 +0,0 @@
-/*
- * Copyright (c) 2016-2020, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  * Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of Intel Corporation nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/** \file
- * \brief Teddy literal matcher: AVX2 engine runtime.
- */
-
-#include "fdr_internal.h"
-#include "flood_runtime.h"
-#include "teddy.h"
-#include "teddy_internal.h"
-#include "teddy_runtime_common.h"
-#include "util/arch.h"
-#include "util/simd_utils.h"
-
-#if defined(HAVE_AVX2)
-
-const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = {
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
-    {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
-};
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
-
-#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn)          \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn)          \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, pt,               \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-static really_inline
-const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
-    return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))
-                          + ROUNDUP_CL(2 * numMask * sizeof(m256)));
-}
-
-#else
-
-#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u64a)) {                                     \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn)              \
-do {                                                                        \
-    if (unlikely(chunk != ones_u32)) {                                      \
-        chunk = ~chunk;                                                     \
-        conf_fn(&chunk, bucket, off, confBase, reason, a, ptr,              \
-                &control, &last_match);                                     \
-        CHECK_HWLM_TERMINATE_MATCHING;                                      \
-    }                                                                       \
-} while(0)
-
-static really_inline
-const m256 *getMaskBase_fat(const struct Teddy *teddy) {
-    return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
-}
-
-#endif
-
-#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy
-
-const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = {
-    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
-    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
-    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
-};
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)         \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m512 msk_interleave = load512(p_mask_interleave);                   \
-        m512 r = vpermb512(msk_interleave, var);                            \
-        m128 r0 = extract128from512(r, 0);                                  \
-        m128 r1 = extract128from512(r, 1);                                  \
-        m128 r2 = extract128from512(r, 2);                                  \
-        m128 r3 = extract128from512(r, 3);                                  \
-        u64a part1 = movq(r0);                                              \
-        u64a part2 = extract64from128(r0, 1);                               \
-        u64a part3 = movq(r1);                                              \
-        u64a part4 = extract64from128(r1, 1);                               \
-        u64a part5 = movq(r2);                                              \
-        u64a part6 = extract64from128(r2, 1);                               \
-        u64a part7 = movq(r3);                                              \
-        u64a part8 = extract64from128(r3, 1);                               \
-        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn);      \
-        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn)         \
-do {                                                                        \
-    if (unlikely(diff512(var, ones512()))) {                                \
-        m512 msk_interleave = load512(p_mask_interleave);                   \
-        m512 r = vpermb512(msk_interleave, var);                            \
-        m128 r0 = extract128from512(r, 0);                                  \
-        m128 r1 = extract128from512(r, 1);                                  \
-        m128 r2 = extract128from512(r, 2);                                  \
-        m128 r3 = extract128from512(r, 3);                                  \
-        u32 part1 = movd(r0);                                               \
-        u32 part2 = extract32from128(r0, 1);                                \
-        u32 part3 = extract32from128(r0, 2);                                \
-        u32 part4 = extract32from128(r0, 3);                                \
-        u32 part5 = movd(r1);                                               \
-        u32 part6 = extract32from128(r1, 1);                                \
-        u32 part7 = extract32from128(r1, 2);                                \
-        u32 part8 = extract32from128(r1, 3);                                \
-        u32 part9 = movd(r2);                                               \
-        u32 part10 = extract32from128(r2, 1);                               \
-        u32 part11 = extract32from128(r2, 2);                               \
-        u32 part12 = extract32from128(r2, 3);                               \
-        u32 part13 = movd(r3);                                              \
-        u32 part14 = extract32from128(r3, 1);                               \
-        u32 part15 = extract32from128(r3, 2);                               \
-        u32 part16 = extract32from128(r3, 3);                               \
-        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn);      \
-        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn);  \
-        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \
-        CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\
-        CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\
-    }                                                                       \
-} while(0)
-#endif
-
-#define PREP_FAT_SHUF_MASK                                                  \
-    m512 lo = and512(val, *lo_mask);                                        \
-    m512 hi = and512(rshift64_m512(val, 4), *lo_mask)
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M1                          \
-    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),    \
-                            pshufb_m512(dup_mask[1], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M2                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M1                              \
-    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),    \
-                            pshufb_m512(dup_mask[3], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M3                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M2                              \
-    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),    \
-                            pshufb_m512(dup_mask[5], hi));
-
-#define FAT_TEDDY_VBMI_PSHUFB_OR_M4                          \
-    FAT_TEDDY_VBMI_PSHUFB_OR_M3                              \
-    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),    \
-                            pshufb_m512(dup_mask[7], hi));
-
-#define FAT_TEDDY_VBMI_SL1_MASK   0xfffffffefffffffeULL
-#define FAT_TEDDY_VBMI_SL2_MASK   0xfffffffcfffffffcULL
-#define FAT_TEDDY_VBMI_SL3_MASK   0xfffffff8fffffff8ULL
-
-#define FAT_TEDDY_VBMI_SHIFT_M1
-
-#define FAT_TEDDY_VBMI_SHIFT_M2                      \
-    FAT_TEDDY_VBMI_SHIFT_M1                          \
-    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
-
-#define FAT_TEDDY_VBMI_SHIFT_M3                      \
-    FAT_TEDDY_VBMI_SHIFT_M2                          \
-    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
-
-#define FAT_TEDDY_VBMI_SHIFT_M4                      \
-    FAT_TEDDY_VBMI_SHIFT_M3                          \
-    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
-
-#define FAT_SHIFT_OR_M1            \
-    shuf_or_b0
-
-#define FAT_SHIFT_OR_M2            \
-    or512(sl1, FAT_SHIFT_OR_M1)
-
-#define FAT_SHIFT_OR_M3            \
-    or512(sl2, FAT_SHIFT_OR_M2)
-
-#define FAT_SHIFT_OR_M4            \
-    or512(sl3, FAT_SHIFT_OR_M3)
-
-static really_inline
-m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask,
-                            UNUSED const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M1;
-    FAT_TEDDY_VBMI_SHIFT_M1;
-    return FAT_SHIFT_OR_M1;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M2;
-    FAT_TEDDY_VBMI_SHIFT_M2;
-    return FAT_SHIFT_OR_M2;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M3;
-    FAT_TEDDY_VBMI_SHIFT_M3;
-    return FAT_SHIFT_OR_M3;
-}
-
-static really_inline
-m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask,
-                            const m512 *sl_msk, const m512 val) {
-    PREP_FAT_SHUF_MASK;
-    FAT_TEDDY_VBMI_PSHUFB_OR_M4;
-    FAT_TEDDY_VBMI_SHIFT_M4;
-    return FAT_SHIFT_OR_M4;
-}
-
-#define PREP_CONF_FAT_FN(val, n)    \
-    prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val)
-
-#define FAT_TEDDY_VBMI_SL1_POS    15
-#define FAT_TEDDY_VBMI_SL2_POS    14
-#define FAT_TEDDY_VBMI_SL3_POS    13
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1        \
-    sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2        \
-    sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
-
-#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4    \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3        \
-    sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
-
-/*
- * In FAT teddy, it needs 2 bytes to represent result of each position,
- * so each nibble's(for example, lo nibble of last byte) FAT teddy mask
- * has 16x2 bytes:
- *   |----------------------------------|----------------------------------|
- *   16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte)
- *                     A                                  B
- * at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes:
- *   |----------------------------------|----------------------------------|
- *   16bytes input data (lo nibbles)    16bytes duplicated data (lo nibbles)
- *                     X                                  X
- * then do pshufb_m256(AB, XX).
- *
- * In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them
- * to 64 bytes:
- *   |----------------|----------------|----------------|----------------|
- *            X                Y                X                Y
- * in this case we need DUP_FAT_MASK to construct AABB:
- *   |----------------|----------------|----------------|----------------|
- *            A                A                B                B
- * then do pshufb_m512(AABB, XYXY).
- */
-
-#define PREPARE_FAT_MASKS(n)                                                  \
-    m512 lo_mask = set1_64x8(0xf);                                              \
-    m512 sl_msk[n - 1];                                                       \
-    FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n
-
-#define FAT_TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffULL >> n_sh)
-#define FAT_TEDDY_VBMI_CONF_MASK_FULL   ((0xffffffffULL << n_sh) & 0xffffffffULL)
-#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
-#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffULL >> (32 - n_sh))
-
-#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                   \
-do {                                                                          \
-    const u8 *buf_end = a->buf + a->len;                                      \
-    const u8 *ptr = a->buf + a->start_offset;                                 \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                   \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                           \
-    u32 last_match = ones_u32;                                                \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                    \
-    const size_t iterBytes = 32;                                              \
-    u32 n_sh = n_msk - 1;                                                     \
-    const size_t loopBytes = 32 - n_sh;                                       \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",                 \
-                 a->buf, a->len, a->start_offset);                            \
-                                                                              \
-    const m512 *dup_mask = getDupMaskBase(teddy, n_msk);                      \
-    PREPARE_FAT_MASKS(n_msk);                                                 \
-    const u32 *confBase = getConfBase(teddy);                                 \
-                                                                              \
-    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;                                   \
-    m512 p_mask = set_mask_m512(~((k << 32) | k));                            \
-    u32 overlap = 0;                                                          \
-    u64a patch = 0;                                                           \
-    if (likely(ptr + loopBytes <= buf_end)) {                                 \
-        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;                              \
-        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));                     \
-        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk);          \
-        r_0 = or512(r_0, p_mask0);                                            \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn);               \
-        ptr += loopBytes;                                                     \
-        overlap = n_sh;                                                       \
-        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;                               \
-    }                                                                         \
-                                                                              \
-    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {                    \
-        CHECK_FLOOD;                                                          \
-        m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk);   \
-        r_0 = or512(r_0, p_mask);                                             \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn);     \
-    }                                                                         \
-                                                                              \
-    assert(ptr + loopBytes > buf_end);                                        \
-    if (ptr < buf_end) {                                                      \
-        u32 left = (u32)(buf_end - ptr);                                      \
-        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);                         \
-        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));                     \
-        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));   \
-        m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk);                            \
-        r_0 = or512(r_0, p_mask1);                                            \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn);     \
-    }                                                                         \
-                                                                              \
-    return HWLM_SUCCESS;                                                      \
-} while(0)
-
-#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy
-
-#ifdef ARCH_64_BIT
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m256 swap = swap128in256(var);                                      \
-        m256 r = interleave256lo(var, swap);                                \
-        u64a part1 = extractlow64from256(r);                                \
-        u64a part2 = extract64from256(r, 1);                                \
-        r = interleave256hi(var, swap);                                     \
-        u64a part3 = extractlow64from256(r);                                \
-        u64a part4 = extract64from256(r, 1);                                \
-        CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn);          \
-        CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn);      \
-        CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn);      \
-        CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn);     \
-    }                                                                       \
-} while(0)
-#else
-#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn)             \
-do {                                                                        \
-    if (unlikely(diff256(var, ones256()))) {                                \
-        m256 swap = swap128in256(var);                                      \
-        m256 r = interleave256lo(var, swap);                                \
-        u32 part1 = extractlow32from256(r);                                 \
-        u32 part2 = extract32from256(r, 1);                                 \
-        u32 part3 = extract32from256(r, 2);                                 \
-        u32 part4 = extract32from256(r, 3);                                 \
-        r = interleave256hi(var, swap);                                     \
-        u32 part5 = extractlow32from256(r);                                 \
-        u32 part6 = extract32from256(r, 1);                                 \
-        u32 part7 = extract32from256(r, 2);                                 \
-        u32 part8 = extract32from256(r, 3);                                 \
-        CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn);          \
-        CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn);      \
-        CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn);     \
-        CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn);     \
-        CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn);     \
-    }                                                                       \
-} while(0)
-#endif
-
-static really_inline
-m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
-                       const u8 *lo, const u8 *hi,
-                       const u8 *buf_history, size_t len_history,
-                       const u32 nMasks) {
-    m128 p_mask128;
-    m256 ret = set1_2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi,
-                                        buf_history, len_history, nMasks));
-    *p_mask = set1_2x128(p_mask128);
-    return ret;
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    return or256(pshufb_m256(maskBase[0 * 2], lo),
-                 pshufb_m256(maskBase[0 * 2 + 1], hi));
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m1(maskBase, val);
-
-    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
-                       pshufb_m256(maskBase[1 * 2 + 1], hi));
-    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
-    *old_1 = res_1;
-    return or256(r, res_shifted_1);
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2,
-                            m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val);
-
-    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
-                       pshufb_m256(maskBase[2 * 2 + 1], hi));
-    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
-    *old_2 = res_2;
-    return or256(r, res_shifted_2);
-}
-
-static really_inline
-m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2,
-                            m256 *old_3, m256 val) {
-    m256 mask = set1_32x8(0xf);
-    m256 lo = and256(val, mask);
-    m256 hi = and256(rshift64_m256(val, 4), mask);
-    m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val);
-
-    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
-                       pshufb_m256(maskBase[3 * 2 + 1], hi));
-    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
-    *old_3 = res_3;
-    return or256(r, res_shifted_3);
-}
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_1                                        \
-do {                                                                        \
-} while(0)
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_2                                        \
-    m256 res_old_1 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_3                                        \
-    m256 res_old_1 = zeroes256();                                           \
-    m256 res_old_2 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD_4                                        \
-    m256 res_old_1 = zeroes256();                                           \
-    m256 res_old_2 = zeroes256();                                           \
-    m256 res_old_3 = zeroes256();
-
-#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n
-
-#define PREP_CONF_FAT_FN_1(mask_base, val)                                  \
-    prep_conf_fat_teddy_m1(mask_base, val)
-
-#define PREP_CONF_FAT_FN_2(mask_base, val)                                  \
-    prep_conf_fat_teddy_m2(mask_base, &res_old_1, val)
-
-#define PREP_CONF_FAT_FN_3(mask_base, val)                                  \
-    prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val)
-
-#define PREP_CONF_FAT_FN_4(mask_base, val)                                  \
-    prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val)
-
-#define PREP_CONF_FAT_FN(mask_base, val, n)                                 \
-    PREP_CONF_FAT_FN_##n(mask_base, val)
-
-#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn)                 \
-do {                                                                        \
-    const u8 *buf_end = a->buf + a->len;                                    \
-    const u8 *ptr = a->buf + a->start_offset;                               \
-    u32 floodBackoff = FLOOD_BACKOFF_START;                                 \
-    const u8 *tryFloodDetect = a->firstFloodDetect;                         \
-    u32 last_match = ones_u32;                                              \
-    const struct Teddy *teddy = (const struct Teddy *)fdr;                  \
-    const size_t iterBytes = 32;                                            \
-    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",               \
-                 a->buf, a->len, a->start_offset);                          \
-                                                                            \
-    const m256 *maskBase = getMaskBase_fat(teddy);                          \
-    const u32 *confBase = getConfBase(teddy);                               \
-                                                                            \
-    FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk);                                      \
-    const u8 *mainStart = ROUNDUP_PTR(ptr, 16);                             \
-    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);         \
-    if (ptr < mainStart) {                                                  \
-        ptr = mainStart - 16;                                               \
-        m256 p_mask;                                                        \
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,       \
-                                       a->buf, buf_end,                     \
-                                       a->buf_history, a->len_history,      \
-                                       n_msk);                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
-        r_0 = or256(r_0, p_mask);                                           \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    if (ptr + 16 <= buf_end) {                                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {                 \
-        __builtin_prefetch(ptr + (iterBytes * 4));                          \
-        CHECK_FLOOD;                                                        \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
-        m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk);  \
-        CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn);              \
-    }                                                                       \
-                                                                            \
-    if (ptr + 16 <= buf_end) {                                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk);       \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn);               \
-        ptr += 16;                                                          \
-    }                                                                       \
-                                                                            \
-    assert(ptr + 16 > buf_end);                                             \
-    if (ptr < buf_end) {                                                    \
-        m256 p_mask;                                                        \
-        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,       \
-                                       a->buf_history, a->len_history,      \
-                                       n_msk);                              \
-        m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk);                \
-        r_0 = or256(r_0, p_mask);                                           \
-        CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn);                  \
-    }                                                                       \
-                                                                            \
-    return HWLM_SUCCESS;                                                    \
-} while(0)
-
-#endif // HAVE_AVX512VBMI
-
-hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
-                                      const struct FDR_Runtime_Args *a,
-                                      hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
-
-hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
-                                          const struct FDR_Runtime_Args *a,
-                                          hwlm_group_t control) {
-    FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy);
-}
-
-#endif // HAVE_AVX2
--- a/src/fdr/teddy_compile.cpp
+++ b/src/fdr/teddy_compile.cpp
@ -328,7 +328,7 @@ bool pack(const vector<hwlmLiteral> &lits,

 static
 void initReinforcedTable(u8 *rmsk) {
-    u64a *mask = (u64a *)rmsk;
+    u64a *mask = reinterpret_cast<u64a *>(rmsk);
    fill_n(mask, N_CHARS, 0x00ffffffffffffffULL);
 }

@ -576,8 +576,8 @@ bytecode_ptr<FDR> TeddyCompiler::build() {

    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
    assert(fdr); // otherwise would have thrown std::bad_alloc
-    Teddy *teddy = (Teddy *)fdr.get(); // ugly
-    u8 *teddy_base = (u8 *)teddy;
+    Teddy *teddy = reinterpret_cast<Teddy *>(fdr.get()); // ugly
+    u8 *teddy_base = reinterpret_cast<u8 *>(teddy);

    // Write header.
    teddy->size = size;
@ -597,7 +597,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
    assert(ISALIGNED_CL(ptr));
    teddy->floodOffset = verify_u32(ptr - teddy_base);
    memcpy(ptr, floodTable.get(), floodTable.size());
-    ptr += floodTable.size();
+    

    // Write teddy masks.
    u8 *baseMsk = teddy_base + ROUNDUP_CL(headerSize);
@ -622,7 +622,7 @@ bytecode_ptr<FDR> TeddyCompiler::build() {
 static
 bool assignStringsToBuckets(
                const vector<hwlmLiteral> &lits,
-                TeddyEngineDescription &eng,
+                const TeddyEngineDescription &eng,
                map<BucketIndex, vector<LiteralIndex>> &bucketToLits) {
    assert(eng.numMasks <= MAX_NUM_MASKS);
    if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
--- a/src/fdr/teddy_engine_description.cpp
+++ b/src/fdr/teddy_engine_description.cpp
@ -52,14 +52,14 @@ u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const {

 void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
    static const TeddyEngineDef defns[] = {
-        { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false },
-        { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true },
-        { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false },
-        { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true },
-        { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false },
-        { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true },
-        { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false },
-        { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true },
+        { 3, HS_CPU_FEATURES_AVX2, 1, 16, false },
+        { 4, HS_CPU_FEATURES_AVX2, 1, 16, true },
+        { 5, HS_CPU_FEATURES_AVX2, 2, 16, false },
+        { 6, HS_CPU_FEATURES_AVX2, 2, 16, true },
+        { 7, HS_CPU_FEATURES_AVX2, 3, 16, false },
+        { 8, HS_CPU_FEATURES_AVX2, 3, 16, true },
+        { 9, HS_CPU_FEATURES_AVX2, 4, 16, false },
+        { 10, HS_CPU_FEATURES_AVX2, 4, 16, true },
        { 11, 0, 1, 8, false },
        { 12, 0, 1, 8, true },
        { 13, 0, 2, 8, false },
@ -71,6 +71,7 @@ void getTeddyDescriptions(vector<TeddyEngineDescription> *out) {
    };
    out->clear();
    for (const auto &def : defns) {
+        // cppcheck-suppress useStlAlgorithm
        out->emplace_back(def);
    }
 }
@ -123,6 +124,7 @@ bool isAllowed(const vector<hwlmLiteral> &vl, const TeddyEngineDescription &eng,
        u32 n_small_lits = 0;
        for (const auto &lit : vl) {
            if (lit.s.length() < eng.numMasks) {
+                // cppcheck-suppress useStlAlgorithm
                n_small_lits++;
            }
        }
@ -204,6 +206,7 @@ unique_ptr<TeddyEngineDescription> getTeddyDescription(u32 engineID) {
    getTeddyDescriptions(&descs);

    for (const auto &desc : descs) {
+        // cppcheck-suppress useStlAlgorithm
        if (desc.getID() == engineID) {
            return std::make_unique<TeddyEngineDescription>(desc);
        }
--- a/src/fdr/teddy_engine_description.h
+++ b/src/fdr/teddy_engine_description.h
@ -39,7 +39,7 @@ namespace ue2 {

 #define TEDDY_BUCKET_LOAD 6

-struct TeddyEngineDef {
+struct TeddyEngineDef {     //NOLINT (clang-analyzer-optin.performance.Padding)
    u32 id;
    u64a cpu_features;
    u32 numMasks;
--- a/src/fdr/teddy_fat.cpp
+++ b/src/fdr/teddy_fat.cpp
@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2015-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* fat teddy for AVX2 and AVX512VBMI */
+
+#include "fdr_internal.h"
+#include "flood_runtime.h"
+#include "teddy.h"
+#include "teddy_internal.h"
+#include "teddy_runtime_common.h"
+#include "util/arch.h"
+#include "util/simd_utils.h"
+
+#if defined(HAVE_AVX2)
+
+#ifdef ARCH_64_BIT
+static really_inline
+hwlm_error_t conf_chunk_64(u64a chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,             
+                           const struct FDR_Runtime_Args *a,           
+                           hwlm_group_t *control,            
+                           u32 *last_match) {    
+    if (unlikely(chunk != ones_u64a)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING    
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+
+    }
+    return HWLM_SUCCESS;
+}
+
+#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+ if(conf_chunk_64(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+#else
+static really_inline
+hwlm_error_t conf_chunk_32(u32 chunk, u8 bucket, u8 offset,
+                           CautionReason reason, const u8 *pt,
+                           const u32* confBase,
+                           const struct FDR_Runtime_Args *a,
+                           hwlm_group_t *control,
+                           u32 *last_match) {
+    if (unlikely(chunk != ones_u32)) {
+        chunk = ~chunk;
+        do_confWithBit_teddy(&chunk, bucket, offset, confBase, reason, a, pt,
+                control, last_match);
+        // adapted from CHECK_HWLM_TERMINATE_MATCHING
+        if (unlikely(*control == HWLM_TERMINATE_MATCHING)) {
+            return HWLM_TERMINATED;
+        }
+    }
+    return HWLM_SUCCESS;
+}
+
+
+#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) \
+ if(conf_chunk_32(chunk, bucket, off, reason, pt, confBase, a, control, last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+#endif
+
+
+#if defined(HAVE_AVX512VBMI) // VBMI strong teddy
+
+ // fat 512 teddy is only with vbmi
+
+static really_inline
+const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) {
+    /* The m512 table is located past two cache-line-rounded regions: the
+     * Teddy header and a block of 2 * numMask m256 masks. */
+    const u8 *base = (const u8 *)teddy;
+    const size_t header_len = ROUNDUP_CL(sizeof(struct Teddy));
+    const size_t mask_len = ROUNDUP_CL(2 * numMask * sizeof(m256));
+    return (const m512 *)(base + header_len + mask_len);
+}
+
+/* Byte index table loaded by the confirm_fat_teddy_*_512 helpers and passed
+ * as the first argument of vpermb512(). Entries alternate i, 32 + i for
+ * i = 0..31.
+ * NOTE(review): presumably this interleaves the bytes of the low and high
+ * 256-bit halves of the permuted vector - confirm against vpermb512().
+ */
+const u8 ALIGN_CL_DIRECTIVE p_mask_interleave[64] = {
+    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
+};
+
+#ifdef ARCH_64_BIT
+/*
+ * Confirm a 512-bit fat teddy result vector in eight 64-bit chunks.
+ * Returns HWLM_TERMINATED as soon as one chunk confirm reports termination,
+ * HWLM_SUCCESS otherwise.
+ */
+hwlm_error_t confirm_fat_teddy_64_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    /* Common case: var does not differ from the all-ones vector. */
+    if (likely(!diff512(var, ones512()))) {
+        return HWLM_SUCCESS;
+    }
+
+    /* Reorder the bytes of var through the p_mask_interleave index table. */
+    const m512 shuffled = vpermb512(load512(p_mask_interleave), var);
+
+    /* Split into four 128-bit lanes, each holding two 64-bit chunks. */
+    const m128 lane0 = extract128from512(shuffled, 0);
+    const m128 lane1 = extract128from512(shuffled, 1);
+    const m128 lane2 = extract128from512(shuffled, 2);
+    const m128 lane3 = extract128from512(shuffled, 3);
+
+    const u64a c0 = movq(lane0);
+    const u64a c1 = extract64from128(lane0, 1);
+    const u64a c2 = movq(lane1);
+    const u64a c3 = extract64from128(lane1, 1);
+    const u64a c4 = movq(lane2);
+    const u64a c5 = extract64from128(lane2, 1);
+    const u64a c6 = movq(lane3);
+    const u64a c7 = extract64from128(lane3, 1);
+
+    /* Consecutive chunks advance the confirm offset by 4. */
+    CONF_FAT_CHUNK_64(c0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c1, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c3, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c4, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c5, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c6, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c7, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+#define confirm_fat_teddy_512_f confirm_fat_teddy_64_512
+#else // 32-64
+
+/*
+ * Confirm a 512-bit fat teddy result vector in sixteen 32-bit chunks
+ * (non-ARCH_64_BIT builds). Returns HWLM_TERMINATED as soon as one chunk
+ * confirm reports termination, HWLM_SUCCESS otherwise.
+ */
+hwlm_error_t confirm_fat_teddy_32_512(m512 var, u8 bucket, u8 offset,
+                                  CautionReason reason, const u8 *ptr,
+                                  const struct FDR_Runtime_Args *a,
+                                  const u32* confBase, hwlm_group_t *control,
+                                  u32 *last_match) {
+    /* Common case: var does not differ from the all-ones vector. */
+    if (likely(!diff512(var, ones512()))) {
+        return HWLM_SUCCESS;
+    }
+
+    /* Reorder the bytes of var through the p_mask_interleave index table. */
+    const m512 shuffled = vpermb512(load512(p_mask_interleave), var);
+
+    /* Split into four 128-bit lanes, each holding four 32-bit chunks. */
+    const m128 lane0 = extract128from512(shuffled, 0);
+    const m128 lane1 = extract128from512(shuffled, 1);
+    const m128 lane2 = extract128from512(shuffled, 2);
+    const m128 lane3 = extract128from512(shuffled, 3);
+
+    const u32 c0 = movd(lane0);
+    const u32 c1 = extract32from128(lane0, 1);
+    const u32 c2 = extract32from128(lane0, 2);
+    const u32 c3 = extract32from128(lane0, 3);
+    const u32 c4 = movd(lane1);
+    const u32 c5 = extract32from128(lane1, 1);
+    const u32 c6 = extract32from128(lane1, 2);
+    const u32 c7 = extract32from128(lane1, 3);
+    const u32 c8 = movd(lane2);
+    const u32 c9 = extract32from128(lane2, 1);
+    const u32 c10 = extract32from128(lane2, 2);
+    const u32 c11 = extract32from128(lane2, 3);
+    const u32 c12 = movd(lane3);
+    const u32 c13 = extract32from128(lane3, 1);
+    const u32 c14 = extract32from128(lane3, 2);
+    const u32 c15 = extract32from128(lane3, 3);
+
+    /* Consecutive chunks advance the confirm offset by 2. */
+    CONF_FAT_CHUNK_32(c0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c1, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c3, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c4, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c5, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c6, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c7, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c8, bucket, offset + 16, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c9, bucket, offset + 18, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c10, bucket, offset + 20, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c11, bucket, offset + 22, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c12, bucket, offset + 24, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c13, bucket, offset + 26, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c14, bucket, offset + 28, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c15, bucket, offset + 30, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+#define confirm_fat_teddy_512_f confirm_fat_teddy_32_512
+#endif // 32/64
+
+#define CONFIRM_FAT_TEDDY_512(...) if(confirm_fat_teddy_512_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+#define TEDDY_VBMI_SL1_MASK   0xfffffffffffffffeULL
+#define TEDDY_VBMI_SL2_MASK   0xfffffffffffffffcULL
+#define TEDDY_VBMI_SL3_MASK   0xfffffffffffffff8ULL
+
+#define FAT_TEDDY_VBMI_SL1_MASK   0xfffffffefffffffeULL
+#define FAT_TEDDY_VBMI_SL2_MASK   0xfffffffcfffffffcULL
+#define FAT_TEDDY_VBMI_SL3_MASK   0xfffffff8fffffff8ULL
+
+#define FAT_TEDDY_VBMI_SL1_POS    15
+#define FAT_TEDDY_VBMI_SL2_POS    14
+#define FAT_TEDDY_VBMI_SL3_POS    13
+
+#define FAT_TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffULL >> n_sh)
+#define FAT_TEDDY_VBMI_CONF_MASK_FULL   ((0xffffffffULL << n_sh) & 0xffffffffULL)
+#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap)
+#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffULL >> (32 - n_sh))
+/*
+ * Build the fat teddy result vector for one input vector (AVX512VBMI path).
+ *
+ * val is split into per-byte low nibbles (and with *lo_mask) and high
+ * nibbles (64-bit right shift by 4, then and with *lo_mask). For mask
+ * position k (0 .. NMSK-1) the nibbles index dup_mask[2k] / dup_mask[2k+1]
+ * via pshufb_m512 and the two lookups are OR-ed. The results for positions
+ * 1..3 are additionally permuted with sl_msk[0..2] under the
+ * FAT_TEDDY_VBMI_SL{1,2,3}_MASK zeroing masks before everything is OR-ed
+ * into the return value.
+ * NOTE(review): presumably the permute shifts position k by k bytes within
+ * each 256-bit half - confirm against the contents of p_sh_mask_arr.
+ */
+template<int NMSK>
+static really_inline
+m512 prep_conf_fat_teddy_512vbmi_templ(const m512 *lo_mask, const m512 *dup_mask,
+                                       const m512 *sl_msk, const m512 val) {
+    m512 lo = and512(val, *lo_mask);
+    m512 hi = and512(rshift64_m512(val, 4), *lo_mask);
+    m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo),
+                            pshufb_m512(dup_mask[1], hi));
+
+    if constexpr (NMSK == 1) return shuf_or_b0; // single mask: sl_msk is never read
+    m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo),
+                            pshufb_m512(dup_mask[3], hi));
+    m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1);
+    if constexpr (NMSK == 2) return (or512(sl1, shuf_or_b0));
+    m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo),
+                            pshufb_m512(dup_mask[5], hi));
+    m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2);
+    if constexpr (NMSK == 3) return (or512(sl2, or512(sl1, shuf_or_b0)));
+    m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo),
+                            pshufb_m512(dup_mask[7], hi));
+    m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3);
+    return (or512(sl3, or512(sl2, or512(sl1, shuf_or_b0))));
+}
+
+/* 64-bit wide TEDDY_VBMI_* counterparts of the FAT_TEDDY_VBMI_* macros.
+ * They read the locals n_sh and overlap at the expansion site.
+ * NOTE(review): nothing in the visible remainder of this file references
+ * these names - confirm whether they are still needed here.
+ */
+#define TEDDY_VBMI_SL1_POS    15
+#define TEDDY_VBMI_SL2_POS    14
+#define TEDDY_VBMI_SL3_POS    13
+
+#define TEDDY_VBMI_CONF_MASK_HEAD   (0xffffffffffffffffULL >> n_sh)
+#define TEDDY_VBMI_CONF_MASK_FULL   (0xffffffffffffffffULL << n_sh)
+#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap)
+#define TEDDY_VBMI_LOAD_MASK_PATCH  (0xffffffffffffffffULL >> (64 - n_sh))
+
+/*
+ * Fat teddy scan over a->buf[a->start_offset, a->len) using 512-bit vectors
+ * built from two copies of a 256-bit load (set2x256).
+ *
+ * n_sh = NMSK - 1. Every step advances ptr by loopBytes = 32 - n_sh while
+ * the main loop loads 32 bytes starting at ptr - n_sh, so consecutive loads
+ * overlap by n_sh bytes.
+ *
+ * Phases:
+ *   1. head: masked load at ptr (mask FAT_TEDDY_VBMI_CONF_MASK_HEAD),
+ *      confirmed with reason VECTORING;
+ *   2. main loop: unmasked loads, confirmed with reason NOT_CAUTIOUS;
+ *   3. tail: masked load of the remaining bytes, confirmed with VECTORING.
+ * In every phase the result is OR-ed with a p_mask derived from the confirm
+ * mask before it is confirmed.
+ *
+ * Returns HWLM_TERMINATED when a confirm requests termination (via
+ * CONFIRM_FAT_TEDDY_512), HWLM_SUCCESS otherwise.
+ */
+template<int NMSK>
+hwlm_error_t fdr_exec_fat_teddy_512vbmi_templ(const struct FDR *fdr,
+                                              const struct FDR_Runtime_Args *a,
+                                              hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    // floodBackoff, tryFloodDetect and iterBytes are not referenced by name
+    // below; presumably CHECK_FLOOD consumes them - confirm.
+    u32 floodBackoff = FLOOD_BACKOFF_START;
+    const u8 *tryFloodDetect = a->firstFloodDetect;
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    u32 n_sh = NMSK - 1;
+    const size_t loopBytes = 32 - n_sh;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m512 *dup_mask = getDupMaskBase(teddy, NMSK);
+    m512 lo_mask = set1_64x8(0xf);
+    // At least one element: a zero-length array (NMSK == 1) is not valid
+    // ISO C++. The element stays unused in that case because
+    // prep_conf_fat_teddy_512vbmi_templ<1> returns before reading sl_msk.
+    m512 sl_msk[NMSK > 1 ? NMSK - 1 : 1];
+    if constexpr (NMSK > 1) {
+        sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS);
+    }
+    if constexpr (NMSK > 2) {
+        sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS);
+    }
+    if constexpr (NMSK > 3) {
+        sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS);
+    }
+
+    const u32 *confBase = getConfBase(teddy);
+
+    // The 32-bit confirm mask is replicated into both halves of the 64-bit
+    // mask and inverted before being turned into p_mask.
+    u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL;
+    m512 p_mask = set_mask_m512(~((k << 32) | k));
+    u32 overlap = 0;
+    u64a patch = 0;
+    if (likely(ptr + loopBytes <= buf_end)) {
+        // Head block: no bytes before ptr are loaded yet.
+        u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD;
+        m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0));
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu_maskz_m256(k0, ptr)));
+
+        r_0 = or512(r_0, p_mask0);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr);
+        ptr += loopBytes;
+        // From here on the tail (if any) must re-read n_sh bytes before ptr.
+        overlap = n_sh;
+        patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH;
+    }
+
+    for (; ptr + loopBytes <= buf_end; ptr += loopBytes) {
+        CHECK_FLOOD;
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, set2x256(loadu256(ptr - n_sh)));
+        r_0 = or512(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh);
+    }
+
+    assert(ptr + loopBytes > buf_end);
+    if (ptr < buf_end) {
+        // Tail: fewer than loopBytes bytes remain. k1 selects the confirm
+        // positions; patch additionally enables loading the overlap bytes.
+        u32 left = (u32)(buf_end - ptr);
+        u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left);
+        m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1));
+        m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap));
+        m512 r_0 = prep_conf_fat_teddy_512vbmi_templ<NMSK>(&lo_mask, dup_mask, sl_msk, val_0);
+
+        r_0 = or512(r_0, p_mask1);
+        CONFIRM_FAT_TEDDY_512(r_0, 16, 0, VECTORING, ptr - overlap);
+    }
+
+    return HWLM_SUCCESS;
+}
+
+#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_512vbmi_templ
+
+
+#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy
+
+
+#ifdef ARCH_64_BIT
+extern "C" {
+/*
+ * Confirm a 256-bit fat teddy result vector in four 64-bit chunks.
+ * Returns HWLM_TERMINATED as soon as one chunk confirm reports termination,
+ * HWLM_SUCCESS otherwise.
+ */
+hwlm_error_t confirm_fat_teddy_64_256(m256 var, u8 bucket, u8 offset,
+                                      CautionReason reason, const u8 *ptr,
+                                      const struct FDR_Runtime_Args *a,
+                                      const u32* confBase, hwlm_group_t *control,
+                                      u32 *last_match) {
+    /* Common case: var does not differ from the all-ones vector. */
+    if (likely(!diff256(var, ones256()))) {
+        return HWLM_SUCCESS;
+    }
+
+    /* Interleave var with its 128-bit-swapped copy. */
+    const m256 swapped = swap128in256(var);
+    const m256 mixed_lo = interleave256lo(var, swapped);
+    const m256 mixed_hi = interleave256hi(var, swapped);
+
+    const u64a c0 = extractlow64from256(mixed_lo);
+    const u64a c1 = extract64from256(mixed_lo, 1);
+    const u64a c2 = extractlow64from256(mixed_hi);
+    const u64a c3 = extract64from256(mixed_hi, 1);
+
+    /* Consecutive chunks advance the confirm offset by 4. */
+    CONF_FAT_CHUNK_64(c0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c1, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c2, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_64(c3, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+} // extern C
+
+#define confirm_fat_teddy_256_f confirm_fat_teddy_64_256
+
+#else
+extern "C" {
+/*
+ * Confirm a 256-bit fat teddy result vector in eight 32-bit chunks
+ * (non-ARCH_64_BIT builds). Returns HWLM_TERMINATED as soon as one chunk
+ * confirm reports termination, HWLM_SUCCESS otherwise.
+ */
+hwlm_error_t confirm_fat_teddy_32_256(m256 var, u8 bucket, u8 offset,
+                                      CautionReason reason, const u8 *ptr,
+                                      const struct FDR_Runtime_Args *a,
+                                      const u32* confBase, hwlm_group_t *control,
+                                      u32 *last_match) {
+    /* Common case: var does not differ from the all-ones vector. */
+    if (likely(!diff256(var, ones256()))) {
+        return HWLM_SUCCESS;
+    }
+
+    /* Interleave var with its 128-bit-swapped copy. */
+    const m256 swapped = swap128in256(var);
+    const m256 mixed_lo = interleave256lo(var, swapped);
+    const m256 mixed_hi = interleave256hi(var, swapped);
+
+    const u32 c0 = extractlow32from256(mixed_lo);
+    const u32 c1 = extract32from256(mixed_lo, 1);
+    const u32 c2 = extract32from256(mixed_lo, 2);
+    const u32 c3 = extract32from256(mixed_lo, 3);
+    const u32 c4 = extractlow32from256(mixed_hi);
+    const u32 c5 = extract32from256(mixed_hi, 1);
+    const u32 c6 = extract32from256(mixed_hi, 2);
+    const u32 c7 = extract32from256(mixed_hi, 3);
+
+    /* Consecutive chunks advance the confirm offset by 2. */
+    CONF_FAT_CHUNK_32(c0, bucket, offset, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c1, bucket, offset + 2, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c2, bucket, offset + 4, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c3, bucket, offset + 6, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c4, bucket, offset + 8, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c5, bucket, offset + 10, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c6, bucket, offset + 12, reason, ptr, confBase, a, control, last_match);
+    CONF_FAT_CHUNK_32(c7, bucket, offset + 14, reason, ptr, confBase, a, control, last_match);
+    return HWLM_SUCCESS;
+}
+
+} // extern C
+
+#define confirm_fat_teddy_256_f confirm_fat_teddy_32_256
+
+#endif
+
+#define CONFIRM_FAT_TEDDY_256(...) if(confirm_fat_teddy_256_f(__VA_ARGS__, a, confBase, &control, &last_match) == HWLM_TERMINATED)return HWLM_TERMINATED;
+
+static really_inline
+const m256 *getMaskBase_fat(const struct Teddy *teddy) {
+    /* The m256 mask table starts right after the cache-line-rounded header. */
+    const u8 *after_header = (const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy));
+    return (const m256 *)after_header;
+}
+
+
+/*
+ * 256-bit wrapper around vectoredLoad128(): both the loaded 128-bit value
+ * and the 128-bit p_mask it produces are passed through set1_2x128(); the
+ * widened mask is stored to *p_mask and the widened value is returned.
+ */
+static really_inline
+m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset,
+                       const u8 *lo, const u8 *hi,
+                       const u8 *buf_history, size_t len_history,
+                       const u32 nMasks) {
+    m128 mask128;
+    const m128 data128 = vectoredLoad128(&mask128, ptr, start_offset, lo, hi,
+                                         buf_history, len_history, nMasks);
+    *p_mask = set1_2x128(mask128);
+    return set1_2x128(data128);
+}
+
+/*
+ * Build the fat teddy result vector for one 256-bit input vector (AVX2 path).
+ *
+ * val is split into per-byte low and high nibbles; for mask position k
+ * (0 .. NMSK-1) they index maskBase[2k] / maskBase[2k + 1] via pshufb_m256
+ * and the two lookups are OR-ed. The result for position k >= 1 is combined
+ * with the value saved from the previous call (*old_k) through
+ * vpalignr(res_k, *old_k, 16 - k), after which *old_k is updated, and all
+ * positions are OR-ed into the return value.
+ *
+ * Each stage uses its own mask pair and shift amount (k = 1, 2, 3). The
+ * earlier form used (NMSK - 1) for every stage, so for NMSK == 3 or 4 the
+ * intermediate mask pairs were never consulted and all stages computed the
+ * same value.
+ *
+ * old_1/old_2/old_3 are only read and written for the stages that NMSK
+ * enables.
+ */
+template<int NMSK>
+static really_inline
+m256 prep_conf_fat_teddy_256_templ(const m256 *maskBase, m256 val,
+                                   m256* old_1, m256* old_2, m256* old_3){
+    m256 mask = set1_32x8(0xf);
+    m256 lo = and256(val, mask);
+    m256 hi = and256(rshift64_m256(val, 4), mask);
+    m256 r = or256(pshufb_m256(maskBase[0 * 2], lo),
+                     pshufb_m256(maskBase[0 * 2 + 1], hi));
+    if constexpr (NMSK == 1) return r;
+    // second mask position: mask pair 1, shifted by one byte
+    m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo),
+                       pshufb_m256(maskBase[1 * 2 + 1], hi));
+    m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1);
+    *old_1 = res_1;
+    r = or256(r, res_shifted_1);
+    if constexpr (NMSK == 2) return r;
+    // third mask position: mask pair 2, shifted by two bytes
+    m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo),
+                       pshufb_m256(maskBase[2 * 2 + 1], hi));
+    m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2);
+    *old_2 = res_2;
+    r = or256(r, res_shifted_2);
+    if constexpr (NMSK == 3) return r;
+    // fourth mask position: mask pair 3, shifted by three bytes
+    m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo),
+                       pshufb_m256(maskBase[3 * 2 + 1], hi));
+    m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3);
+    *old_3 = res_3;
+    return or256(r, res_shifted_3);
+}
+/*
+ * Fat teddy scan over a->buf[a->start_offset, a->len) using 256-bit vectors
+ * built from 16-byte blocks (load2x128 / vectoredLoad2x128).
+ *
+ * Phases, in order:
+ *   1. if ptr is below mainStart: one block at mainStart - 16 loaded with
+ *      vectoredLoad2x128, result OR-ed with p_mask, confirmed as VECTORING;
+ *   2. one optional full 16-byte block, confirmed as VECTORING;
+ *   3. main loop: two 16-byte blocks (iterBytes = 32) per iteration,
+ *      confirmed as NOT_CAUTIOUS at offsets 0 and 16, with prefetch and
+ *      CHECK_FLOOD;
+ *   4. one optional trailing full 16-byte block, NOT_CAUTIOUS;
+ *   5. remaining bytes (fewer than 16) via vectoredLoad2x128, result OR-ed
+ *      with p_mask, confirmed as VECTORING.
+ * res_old_1..3 carry per-mask results from one block to the next.
+ *
+ * Returns HWLM_TERMINATED when a confirm requests termination (via
+ * CONFIRM_FAT_TEDDY_256), HWLM_SUCCESS otherwise.
+ */
+template<int NMSK>
+hwlm_error_t fdr_exec_fat_teddy_256_templ(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    const u8 *buf_end = a->buf + a->len;
+    const u8 *ptr = a->buf + a->start_offset;
+    u32 floodBackoff = FLOOD_BACKOFF_START; // presumably consumed by CHECK_FLOOD - confirm
+    const u8 *tryFloodDetect = a->firstFloodDetect; // presumably consumed by CHECK_FLOOD - confirm
+    u32 last_match = ones_u32;
+    const struct Teddy *teddy = (const struct Teddy *)fdr;
+    const size_t iterBytes = 32;
+    DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n",
+                 a->buf, a->len, a->start_offset);
+
+    const m256 *maskBase = getMaskBase_fat(teddy);
+    const u32 *confBase = getConfBase(teddy);
+
+    m256 res_old_1 = zeroes256();
+    m256 res_old_2 = zeroes256();
+    m256 res_old_3 = zeroes256();
+    const u8 *mainStart = ROUNDUP_PTR(ptr, 16); // presumably the next 16-byte boundary at or after ptr
+    DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart);
+    if (ptr < mainStart) {
+        ptr = mainStart - 16; // step back so the block ends at mainStart
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset,
+                                       a->buf, buf_end,
+                                       a->buf_history, a->len_history,
+                                       NMSK);
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+        ptr += 16;
+    }
+
+    for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) {
+        __builtin_prefetch(ptr + (iterBytes * 4));
+        CHECK_FLOOD;
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
+        m256 r_1 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr + 16), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_1, 16, 16, NOT_CAUTIOUS, ptr); // second block: confirm offset 16 relative to ptr
+    }
+
+    if (ptr + 16 <= buf_end) {
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, load2x128(ptr), &res_old_1, &res_old_2, &res_old_3);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, NOT_CAUTIOUS, ptr);
+        ptr += 16;
+    }
+
+    assert(ptr + 16 > buf_end);
+    if (ptr < buf_end) {
+        m256 p_mask;
+        m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end,
+                                       a->buf_history, a->len_history,
+                                       NMSK);
+        m256 r_0 = prep_conf_fat_teddy_256_templ<NMSK>(maskBase, val_0, &res_old_1, &res_old_2, &res_old_3);
+        r_0 = or256(r_0, p_mask);
+        CONFIRM_FAT_TEDDY_256(r_0, 16, 0, VECTORING, ptr);
+    }
+    return HWLM_SUCCESS;
+}
+
+// this check is because it is possible to build with both AVX512VBMI and AVX2 defined,
+// to replicate the behaviour of the original flow of control we give preference
+// to the former. If we're building for both then this will be compiled multiple times
+// with the desired variant defined by itself.
+#ifndef FDR_EXEC_FAT_TEDDY_FN
+#define FDR_EXEC_FAT_TEDDY_FN fdr_exec_fat_teddy_256_templ
+#endif
+
+#endif // HAVE_AVX2 for fat teddy
+
+/* we only have fat teddy in these two modes */
+// #if (defined(HAVE_AVX2) || defined(HAVE_AVX512VBMI)) && defined(FDR_EXEC_FAT_TEDDY_FN)
+// #if defined(FDR_EXEC_FAT_TEDDY_FN)
+/*
+ * C-linkage entry points for fat teddy with 1 to 4 masks. Each one forwards
+ * its arguments to FDR_EXEC_FAT_TEDDY_FN<N>, i.e. to whichever template was
+ * selected above. The _pck variants forward to the same instantiation as
+ * their non-_pck counterparts.
+ */
+extern "C" {
+hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<1>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<2>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<3>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr,
+                                      const struct FDR_Runtime_Args *a,
+                                      hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
+}
+
+hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr,
+                                          const struct FDR_Runtime_Args *a,
+                                          hwlm_group_t control) {
+    return FDR_EXEC_FAT_TEDDY_FN<4>(fdr, a, control);
+}
+
+} // extern c
+
+#endif // HAVE_AVX2 from the beginning
+
--- a/src/fdr/teddy_runtime_common.h
+++ b/src/fdr/teddy_runtime_common.h
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2016-2020, Intel Corporation
+ * Copyright (c) 2024, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -40,10 +41,6 @@
 #include "util/simd_utils.h"
 #include "util/uniform_ops.h"

-extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32];
-#if defined(HAVE_AVX2)
-extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64];
-#endif

 #if defined(HAVE_AVX512VBMI)
 static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = {
@ -142,6 +139,37 @@ void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) {
 //          |----------|-------|----------------|............|
 //          0          start   start+offset     end(<=16)
 // p_mask   ffff.....ffffff..ff0000...........00ffff..........
+
+// p_mask_gen() replaces the p_mask_arr lookup table.
+// m is the length of the zone of zero bytes; n determines where that zone
+// begins: there are 16-n bytes of ones (0xff) before the zone begins.
+// m,n 4,7  - 16-7=9 bytes of ones, then 4 bytes of zeros, then ones again:
+// ff ff ff ff ff ff ff ff ff 00 00 00 00 ff ff ff
+// m,n 15,15 - 16-15=1 byte of ones at the beginning, then 15 bytes of
+// zeros; the leading byte of ones pushes the ff at the end off the high
+// end, leaving
+// ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+// m,n 15,16 - 16-16=0 bytes of ones on the low end, then 15 bytes of zeros,
+// then ff at the high end, so,
+// 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff
+// To get the high run of ones we start out with all ones and shift them up
+// by m+(16-n) bytes, i.e. past the leading ones and the zero zone.
+// To fill in the ones that belong on the low end we take all ones and shift
+// them down: that run needs to be 16-n bytes long, meaning a shift down by
+// 16-(16-n), or of course just n.
+// The two results are then OR-ed together.
+static really_inline
+m128 p_mask_gen(u8 m, u8 n){
+    /* Reduce both arguments modulo 17 before deriving the shift counts. */
+    n %= 17;
+    m %= 17;
+    /* Shift count for the second operand: m plus (16 - n), again mod 17. */
+    m += (16 - n);
+    m %= 17;
+    const m128 shifted_r = rshiftbyte_m128(ones128(), n);
+    const m128 shifted_l = lshiftbyte_m128(ones128(), m);
+    return or128(shifted_r, shifted_l);
+}
+
 static really_inline
 m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
@ -161,13 +189,11 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 16) {
            assert(start_offset - start <= 16);
-            *p_mask = loadu128(p_mask_arr[16 - start_offset + start]
-                               + 16 - start_offset + start);
+            *p_mask = p_mask_gen(16 - start_offset + start, 16 - start_offset + start);
            return loadu128(ptr);
        }
        assert(start_offset - start <= avail);
-        *p_mask = loadu128(p_mask_arr[avail - start_offset + start]
-                           + 16 - start_offset + start);
+        *p_mask = p_mask_gen(avail - start_offset + start, 16 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { // start zone
@ -180,8 +206,7 @@ m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset,
        }
        uintptr_t end = MIN(16, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
-        *p_mask = loadu128(p_mask_arr[end - start - start_offset]
-                           + 16 - start - start_offset);
+        *p_mask = p_mask_gen(end - start - start_offset, 16 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }
@ -270,6 +295,20 @@ void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) {
 //          |----------|-------|----------------|............|
 //          0          start   start+offset     end(<=32)
 // p_mask   ffff.....ffffff..ff0000...........00ffff..........
+
+/* Like p_mask_gen above, this replaces a large lookup table (p_mask_arr256).
+ * m and n are reduced modulo 33, m is then replaced by (m + 32 - n) mod 33,
+ * and the result is the OR of all-ones shifted by m (rshift_byte_m256) and
+ * all-ones shifted by n (lshift_byte_m256).
+ * NOTE(review): p_mask_gen applies the right shift with n and the left shift
+ * with m, whereas here the right shift takes m and the left shift takes n.
+ * Confirm the byte-shift direction of rshift_byte_m256/lshift_byte_m256
+ * against rshiftbyte_m128/lshiftbyte_m128 to make sure this is intended.
+ */
+static really_inline
+m256 fat_pmask_gen(u8 m, u8 n){
+    m256 a=ones256();
+    m256 b=ones256();
+    m%=33; n%=33;
+    m+=(32-n); m%=33;
+
+    a = rshift_byte_m256(a, m);
+    b = lshift_byte_m256(b, n);
+    return or256(a, b);
+}
+
 static really_inline
 m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
                     const u8 *lo, const u8 *hi,
@ -289,13 +328,11 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
        uintptr_t avail = (uintptr_t)(hi - ptr);
        if (avail >= 32) {
            assert(start_offset - start <= 32);
-            *p_mask = loadu256(p_mask_arr256[32 - start_offset + start]
-                               + 32 - start_offset + start);
+            *p_mask = fat_pmask_gen(32 - start_offset + start, 32 - start_offset + start);
            return loadu256(ptr);
        }
        assert(start_offset - start <= avail);
-        *p_mask = loadu256(p_mask_arr256[avail - start_offset + start]
-                           + 32 - start_offset + start);
+        *p_mask = fat_pmask_gen(avail - start_offset + start, 32 - start_offset + start);
        copy_start = 0;
        copy_len = avail;
    } else { //start zone
@ -308,8 +345,7 @@ m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset,
        }
        uintptr_t end = MIN(32, (uintptr_t)(hi - ptr));
        assert(start + start_offset <= end);
-        *p_mask = loadu256(p_mask_arr256[end - start - start_offset]
-                           + 32 - start - start_offset);
+        *p_mask = fat_pmask_gen(end - start - start_offset, 32 - start - start_offset);
        copy_start = start;
        copy_len = end - start;
    }
@ -428,8 +464,13 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,
        if (!cf) {
            continue;
        }
+#ifdef __cplusplus
+        const struct FDRConfirm *fdrc = reinterpret_cast<const struct FDRConfirm *>
+                                        (reinterpret_cast<const u8 *>(confBase) + cf);
+#else
        const struct FDRConfirm *fdrc = (const struct FDRConfirm *)
                                        ((const u8 *)confBase + cf);
+#endif
        if (!(fdrc->groups & *control)) {
            continue;
        }
@ -442,18 +483,31 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset,

 static really_inline
 const m128 *getMaskBase(const struct Teddy *teddy) {
+#ifdef __cplusplus
+    return reinterpret_cast<const m128 *>(reinterpret_cast<const u8 *>(teddy) + ROUNDUP_CL(sizeof(struct Teddy)));
+#else
    return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)));
+#endif
 }

 static really_inline
 const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) {
+#ifdef __cplusplus
+    return reinterpret_cast<const u64a *>(reinterpret_cast<const u8 *>(getMaskBase(teddy))
+                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+#else
    return (const u64a *)((const u8 *)getMaskBase(teddy)
                          + ROUNDUP_CL(2 * numMask * sizeof(m128)));
+#endif
 }

 static really_inline
 const u32 *getConfBase(const struct Teddy *teddy) {
+#ifdef __cplusplus
+    return reinterpret_cast<const u32 *>(reinterpret_cast<const u8 *>(teddy) + teddy->confOffset);
+#else
    return (const u32 *)((const u8 *)teddy + teddy->confOffset);
+#endif
 }

 #endif /* TEDDY_RUNTIME_COMMON_H_ */
--- a/src/hs.cpp
+++ b/src/hs.cpp
@ -589,7 +589,7 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags,
        return HS_COMPILER_ERROR;
    }

-    hs_expr_info *rv = (hs_expr_info *)hs_misc_alloc(sizeof(*rv));
+    hs_expr_info *rv = static_cast<hs_expr_info *>(hs_misc_alloc(sizeof(*rv)));
    if (!rv) {
        *error = const_cast<hs_compile_error_t *>(&hs_enomem);
        return HS_COMPILER_ERROR;
--- a/src/hs_valid_platform.c
+++ b/src/hs_valid_platform.c
@ -1,5 +1,6 @@
 /*
 * Copyright (c) 2016-2017, Intel Corporation
+ * Copyright (c) 2020-2023, VectorCamp PC
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -29,28 +30,33 @@
 #include "config.h"
 #include "hs_common.h"
 #include "ue2common.h"
+#if !defined(VS_SIMDE_BACKEND)
 #if defined(ARCH_IA32) || defined(ARCH_X86_64)
 #include "util/arch/x86/cpuid_inline.h"
 #elif defined(ARCH_AARCH64)
 #include "util/arch/arm/cpuid_inline.h"
 #endif
+#endif

 HS_PUBLIC_API
 hs_error_t HS_CDECL hs_valid_platform(void) {
-    /* Hyperscan requires SSSE3, anything else is a bonus */
-#if defined(ARCH_IA32) || defined(ARCH_X86_64)
-    if (check_ssse3()) {
+    /* Vectorscan requires SSE4.2, anything else is a bonus */
+#if !defined(VS_SIMDE_BACKEND) && (defined(ARCH_IA32) || defined(ARCH_X86_64))
+    // cppcheck-suppress knownConditionTrueFalse
+    if (check_sse42()) {
        return HS_SUCCESS;
    } else {
        return HS_ARCH_ERROR;
    }
-#elif defined(ARCH_ARM32) || defined(ARCH_AARCH64)
+#elif !defined(VS_SIMDE_BACKEND) && (defined(ARCH_ARM32) || defined(ARCH_AARCH64))
+   //check_neon returns true for now
+   // cppcheck-suppress knownConditionTrueFalse
   if (check_neon()) {
        return HS_SUCCESS;
    } else {
        return HS_ARCH_ERROR;
    }
-#elif defined(ARCH_PPC64EL)
+#elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND)
    return HS_SUCCESS;
 #endif
 }
--- a/src/hwlm/hwlm.c
+++ b/src/hwlm/hwlm.c
@ -73,7 +73,12 @@ const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr,
        return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end);
    case ACCEL_TRUFFLE:
        DEBUG_PRINTF("truffle\n");
-        return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end);
+        return truffleExec(aux->truffle.mask_lo, aux->truffle.mask_hi, ptr, end);
+#ifdef CAN_USE_WIDE_TRUFFLE
+    case ACCEL_TRUFFLE_WIDE:
+        DEBUG_PRINTF("truffle wide\n");
+        return truffleExecWide(aux->truffle.mask, ptr, end);
+#endif // CAN_USE_WIDE_TRUFFLE
    default:
        /* no acceleration, fall through and return current ptr */
        DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type);
@ -170,8 +175,7 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen,
        DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len);
        *start += delta;
    } else if (hlen) {
-        UNUSED size_t remaining = offset + ptr2 - found;
-        DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen);
+        DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", offset + ptr2 - found, hlen);
    }
 }

--- a/src/hwlm/hwlm_build.cpp
+++ b/src/hwlm/hwlm_build.cpp
@ -93,6 +93,7 @@ void dumpLits(UNUSED const vector<hwlmLiteral> &lits) {
 // Called by an assertion.
 static
 bool everyoneHasGroups(const vector<hwlmLiteral> &lits) {
+    // cppcheck-suppress useStlAlgorithm
    for (const auto &lit : lits) {
        if (!lit.groups) {
            return false;
@ -143,7 +144,7 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
    }

    if (!eng) {
-        return nullptr;
+        return bytecode_ptr<HWLM>(nullptr);
    }

    assert(engSize);
@ -155,6 +156,7 @@ bytecode_ptr<HWLM> hwlmBuild(const HWLMProto &proto, const CompileContext &cc,
    auto h = make_zeroed_bytecode_ptr<HWLM>(hwlm_len, 64);

    h->type = proto.engType;
+    // cppcheck-suppress cstyleCast
    memcpy(HWLM_DATA(h.get()), eng.get(), engSize);

    return h;
@ -218,10 +220,12 @@ size_t hwlmSize(const HWLM *h) {

    switch (h->type) {
    case HWLM_ENGINE_NOOD:
-        engSize = noodSize((const noodTable *)HWLM_C_DATA(h));
+	// cppcheck-suppress cstyleCast
+        engSize = noodSize(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)));
        break;
    case HWLM_ENGINE_FDR:
-        engSize = fdrSize((const FDR *)HWLM_C_DATA(h));
+	// cppcheck-suppress cstyleCast
+        engSize = fdrSize(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)));
        break;
    }

--- a/src/hwlm/hwlm_dump.cpp
+++ b/src/hwlm/hwlm_dump.cpp
@ -53,10 +53,12 @@ void hwlmGenerateDumpFiles(const HWLM *h, const string &base) {

    switch (h->type) {
    case HWLM_ENGINE_NOOD:
-        noodPrintStats((const noodTable *)HWLM_C_DATA(h), f);
+        // cppcheck-suppress cstyleCast
+        noodPrintStats(reinterpret_cast<const noodTable *>(HWLM_C_DATA(h)), f);
        break;
    case HWLM_ENGINE_FDR:
-        fdrPrintStats((const FDR *)HWLM_C_DATA(h), f);
+        // cppcheck-suppress cstyleCast
+        fdrPrintStats(reinterpret_cast<const FDR *>(HWLM_C_DATA(h)), f);
        break;
    default:
        fprintf(f, "<unknown hwlm subengine>\n");
--- a/src/hwlm/noodle_build.cpp
+++ b/src/hwlm/noodle_build.cpp
@ -56,7 +56,7 @@ u64a make_u64a_mask(const vector<u8> &v) {

    u64a mask = 0;
    size_t len = v.size();
-    unsigned char *m = (unsigned char *)&mask;
+    u8 *m = reinterpret_cast<u8 *>(&mask);
    DEBUG_PRINTF("making mask len %zu\n", len);
    memcpy(m, &v[0], len);
    return mask;
@ -156,7 +156,7 @@ void noodPrintStats(const noodTable *n, FILE *f) {
            n->msk_len);
    fprintf(f, "String: ");
    for (u32 i = 0; i < n->msk_len; i++) {
-        const u8 *m = (const u8 *)&n->cmp;
+        const u8 *m = reinterpret_cast<const u8 *>(&n->cmp);
        if (isgraph(m[i]) && m[i] != '\\') {
            fprintf(f, "%c", m[i]);
        } else {
--- a/src/hwlm/noodle_engine_sve.hpp
+++ b/src/hwlm/noodle_engine_sve.hpp
@ -148,15 +148,14 @@ hwlm_error_t doubleCheckMatched(const struct noodTable *n, const u8 *buf,
 }

 static really_inline
-svbool_t doubleMatched(svuint16_t chars, const u8 *d,
-                       svbool_t pg, svbool_t pg_rot,
+svbool_t doubleMatchedLoop(svuint16_t chars, const u8 *d,
                       svbool_t * const matched, svbool_t * const matched_rot) {
-    svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
+    svuint16_t vec = svreinterpret_u16(svld1_u8(svptrue_b8(), d));
    // d - 1 won't underflow as the first position in buf has been dealt
    // with meaning that d > buf
-    svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
-    *matched = svmatch(pg, vec, chars);
-    *matched_rot = svmatch(pg_rot, vec_rot, chars);
+    svuint16_t vec_rot = svreinterpret_u16(svld1_u8(svptrue_b8(), d - 1));
+    *matched = svmatch(svptrue_b8(), vec, chars);
+    *matched_rot = svmatch(svptrue_b8(), vec_rot, chars);
    return svorr_z(svptrue_b8(), *matched, *matched_rot);
 }

@ -167,10 +166,34 @@ hwlm_error_t scanDoubleOnce(const struct noodTable *n, const u8 *buf,
    DEBUG_PRINTF("start %p end %p\n", d, e);
    assert(d < e);
    assert(d > buf);
-    svbool_t pg = svwhilelt_b8_s64(0, e - d);
-    svbool_t pg_rot = svwhilelt_b8_s64(0, e - d + 1);
-    svbool_t matched, matched_rot;
-    svbool_t any = doubleMatched(svreinterpret_u16(chars), d, pg, pg_rot, &matched, &matched_rot);
+    const ptrdiff_t size = e - d;
+    svbool_t pg = svwhilelt_b8_s64(0, size);
+    svbool_t pg_rot = svwhilelt_b8_s64(0, size + 1);
+
+    svuint16_t vec = svreinterpret_u16(svld1_u8(pg, d));
+    // d - 1 won't underflow as the first position in buf has been dealt
+    // with meaning that d > buf
+    svuint16_t vec_rot = svreinterpret_u16(svld1_u8(pg_rot, d - 1));
+
+    // we reuse u8 predicates for u16 lanes. This means that we will check against one
+    // extra \0 character at the end of the vector.
+    if(unlikely(n->key1 == '\0')) {
+        if (size % 2) {
+            // if odd, vec has an odd number of lanes and has the spurious \0
+            svbool_t lane_to_disable = svrev_b8(svpfirst(svrev_b8(pg), svpfalse()));
+            pg = sveor_z(svptrue_b8(), pg, lane_to_disable);
+        } else {
+            // if even, vec_rot has an odd number of lanes and has the spurious \0
+            // we need to disable the last active lane as well, but we know pg is
+            // the same as pg_rot without the last lane
+            pg_rot = pg;
+        }
+    }
+
+    svbool_t matched = svmatch(pg, vec, svreinterpret_u16(chars));
+    svbool_t matched_rot = svmatch(pg_rot, vec_rot, svreinterpret_u16(chars));
+    svbool_t any = svorr_z(svptrue_b8(), matched, matched_rot);
+
    return doubleCheckMatched(n, buf, len, cbi, d, matched, matched_rot, any);
 }

@ -187,8 +210,7 @@ hwlm_error_t scanDoubleLoop(const struct noodTable *n, const u8 *buf,
    for (size_t i = 0; i < loops; i++, d += svcntb()) {
        DEBUG_PRINTF("d %p \n", d);
        svbool_t matched, matched_rot;
-        svbool_t any = doubleMatched(svreinterpret_u16(chars), d, svptrue_b8(), svptrue_b8(),
-                                     &matched, &matched_rot);
+        svbool_t any = doubleMatchedLoop(svreinterpret_u16(chars), d, &matched, &matched_rot);
        hwlm_error_t rv = doubleCheckMatched(n, buf, len, cbi, d,
                                             matched, matched_rot, any);
        RETURN_IF_TERMINATED(rv);
--- a/src/nfa/accel.c
+++ b/src/nfa/accel.c
@ -142,9 +142,18 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) {
            return c;
        }

-        rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end);
+        rv = truffleExec(accel->truffle.mask_lo, accel->truffle.mask_hi, c, c_end);
        break;
+#ifdef CAN_USE_WIDE_TRUFFLE
+    case ACCEL_TRUFFLE_WIDE:
+        DEBUG_PRINTF("accel Truffle Wide %p %p\n", c, c_end);
+        if (c + 15 >= c_end) {
+            return c;
+        }

+        rv = truffleExecWide(accel->truffle.mask, c, c_end);
+        break;
+#endif
    case ACCEL_DSHUFTI:
        DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end);
        if (c + 15 + 1 >= c_end) {
--- a/src/nfa/accel.h
+++ b/src/nfa/accel.h
@ -66,6 +66,7 @@ enum AccelType {
    ACCEL_VERM16,
    ACCEL_DVERM16,
    ACCEL_DVERM16_MASKED,
+    ACCEL_TRUFFLE_WIDE,
 };

 /** \brief Structure for accel framework. */
@ -136,8 +137,18 @@ union AccelAux {
    struct {
        u8 accel_type;
        u8 offset;
-        m128 mask1;
-        m128 mask2;
+        union {
+            m256 mask;
+            struct {
+#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
+                m128 mask_lo;
+                m128 mask_hi;
+#else
+                m128 mask_hi;
+                m128 mask_lo;
+#endif
+            };
+        };
    } truffle;
 };

--- a/src/nfa/accel_dfa_build_strat.cpp
+++ b/src/nfa/accel_dfa_build_strat.cpp
@ -182,6 +182,7 @@ vector<vector<CharReach>> generate_paths(const raw_dfa &rdfa,
    vector<vector<CharReach>> rv;
    rv.reserve(paths.size());
    for (auto &p : paths) {
+        // cppcheck-suppress useStlAlgorithm
        rv.emplace_back(vector<CharReach>(std::make_move_iterator(p.reach.begin()),
                                       std::make_move_iterator(p.reach.end())));
    }
@ -426,10 +427,11 @@ void
 accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
                                  const AccelScheme &info,
                                  void *accel_out) {
-    AccelAux *accel = (AccelAux *)accel_out;
+    AccelAux *accel = reinterpret_cast<AccelAux *>(accel_out);

    DEBUG_PRINTF("accelerations scheme has offset s%u/d%u\n", info.offset,
                 info.double_offset);
+    // cppcheck-suppress redundantInitialization
    accel->generic.offset = verify_u8(info.offset);

    if (double_byte_ok(info) && info.double_cr.none() &&
@ -473,7 +475,8 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
                u8 c1 = info.double_byte.begin()->first & m1;
                u8 c2 = info.double_byte.begin()->second & m2;
 #ifdef HAVE_SVE2
-                if (vermicelliDoubleMasked16Build(c1, c2, m1, m2, (u8 *)&accel->mdverm16.mask)) {
+                if (vermicelliDoubleMasked16Build(c1, c2, m1, m2,
+                                                  reinterpret_cast<u8 *>(&accel->mdverm16.mask))) {
                    accel->accel_type = ACCEL_DVERM16_MASKED;
                    accel->mdverm16.offset = verify_u8(info.double_offset);
                    accel->mdverm16.c1 = c1;
@ -482,8 +485,9 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
                                c1, c2);
                    return;
                } else if (info.double_byte.size() <= 8 &&
-                        vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
-                                                (u8 *)&accel->dverm16.firsts)) {
+                        vermicelliDouble16Build(info.double_byte,
+                                                reinterpret_cast<u8 *>(&accel->dverm16.mask),
+                                                reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
                    accel->accel_type = ACCEL_DVERM16;
                    accel->dverm16.offset = verify_u8(info.double_offset);
                    DEBUG_PRINTF("building double16-vermicelli\n");
@ -503,8 +507,9 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
        }
 #ifdef HAVE_SVE2
        if (info.double_byte.size() <= 8 &&
-            vermicelliDouble16Build(info.double_byte, (u8 *)&accel->dverm16.mask,
-                                    (u8 *)&accel->dverm16.firsts)) {
+            vermicelliDouble16Build(info.double_byte,
+                                    reinterpret_cast<u8 *>(&accel->dverm16.mask),
+                                    reinterpret_cast<u8 *>(&accel->dverm16.firsts))) {
            accel->accel_type = ACCEL_DVERM16;
            accel->dverm16.offset = verify_u8(info.double_offset);
            DEBUG_PRINTF("building double16-vermicelli\n");
@ -515,9 +520,11 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,

    if (double_byte_ok(info) &&
        shuftiBuildDoubleMasks(
-            info.double_cr, info.double_byte, (u8 *)&accel->dshufti.lo1,
-            (u8 *)&accel->dshufti.hi1, (u8 *)&accel->dshufti.lo2,
-            (u8 *)&accel->dshufti.hi2)) {
+            info.double_cr, info.double_byte,
+            reinterpret_cast<u8 *>(&accel->dshufti.lo1),
+            reinterpret_cast<u8 *>(&accel->dshufti.hi1),
+            reinterpret_cast<u8 *>(&accel->dshufti.lo2),
+            reinterpret_cast<u8 *>(&accel->dshufti.hi2))) {
        accel->accel_type = ACCEL_DSHUFTI;
        accel->dshufti.offset = verify_u8(info.double_offset);
        DEBUG_PRINTF("state %hu is double shufti\n", this_idx);
@ -549,7 +556,7 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
 #ifdef HAVE_SVE2
    if (info.cr.count() <= 16) {
        accel->accel_type = ACCEL_VERM16;
-        vermicelli16Build(info.cr, (u8 *)&accel->verm16.mask);
+        vermicelli16Build(info.cr, reinterpret_cast<u8 *>(&accel->verm16.mask));
        DEBUG_PRINTF("state %hu is vermicelli16\n", this_idx);
        return;
    }
@ -562,16 +569,27 @@ accel_dfa_build_strat::buildAccel(UNUSED dstate_id_t this_idx,
    }

    accel->accel_type = ACCEL_SHUFTI;
-    if (-1 != shuftiBuildMasks(info.cr, (u8 *)&accel->shufti.lo,
-                               (u8 *)&accel->shufti.hi)) {
+    if (-1 != shuftiBuildMasks(info.cr,
+                               reinterpret_cast<u8 *>(&accel->shufti.lo),
+                               reinterpret_cast<u8 *>(&accel->shufti.hi))) {
        DEBUG_PRINTF("state %hu is shufti\n", this_idx);
        return;
    }

    assert(!info.cr.none());
-    accel->accel_type = ACCEL_TRUFFLE;
-    truffleBuildMasks(info.cr, (u8 *)&accel->truffle.mask1,
-                      (u8 *)&accel->truffle.mask2);
+#if defined(CAN_USE_WIDE_TRUFFLE)
+    if(CAN_USE_WIDE_TRUFFLE) {
+        accel->accel_type = ACCEL_TRUFFLE_WIDE;
+        truffleBuildMasksWide(info.cr,
+                              reinterpret_cast<u8 *>(&accel->truffle.mask));
+    } else
+#endif
+    {
+        accel->accel_type = ACCEL_TRUFFLE;
+        truffleBuildMasks(info.cr,
+                        reinterpret_cast<u8 *>(&accel->truffle.mask_lo),
+                        reinterpret_cast<u8 *>(&accel->truffle.mask_hi));
+    }
    DEBUG_PRINTF("state %hu is truffle\n", this_idx);
 }

--- a/src/nfa/accel_dump.cpp
+++ b/src/nfa/accel_dump.cpp
@ -93,6 +93,8 @@ const char *accelName(u8 accel_type) {
        return "double-shufti";
    case ACCEL_TRUFFLE:
        return "truffle";
+    case ACCEL_TRUFFLE_WIDE:
+        return "truffle wide";
    case ACCEL_RED_TAPE:
        return "red tape";
    default:
@ -178,6 +180,13 @@ void dumpTruffleCharReach(FILE *f, const u8 *hiset, const u8 *hiclear) {
            describeClass(cr).c_str());
 }

+static
+void dumpWideTruffleCharReach(FILE *f, const u8 *mask) {
+    CharReach cr = truffle2crWide(mask);
+    fprintf(f, "count %zu class %s\n", cr.count(),
+            describeClass(cr).c_str());
+}
+
 static
 void dumpTruffleMasks(FILE *f, const u8 *hiset, const u8 *hiclear) {
    fprintf(f, "lo %s\n", dumpMask(hiset, 128).c_str());
@ -210,31 +219,38 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) {
        break;
    case ACCEL_SHUFTI: {
        fprintf(f, "\n");
-        dumpShuftiMasks(f, (const u8 *)&accel.shufti.lo,
-                        (const u8 *)&accel.shufti.hi);
-        dumpShuftiCharReach(f, (const u8 *)&accel.shufti.lo,
-                            (const u8 *)&accel.shufti.hi);
+        dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
+                        reinterpret_cast<const u8 *>(&accel.shufti.hi));
+        dumpShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.shufti.lo),
+                            reinterpret_cast<const u8 *>(&accel.shufti.hi));
        break;
    }
    case ACCEL_DSHUFTI:
        fprintf(f, "\n");
        fprintf(f, "mask 1\n");
-        dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo1,
-                        (const u8 *)&accel.dshufti.hi1);
+        dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
+                        reinterpret_cast<const u8 *>(&accel.dshufti.hi1));
        fprintf(f, "mask 2\n");
-        dumpShuftiMasks(f, (const u8 *)&accel.dshufti.lo2,
-                        (const u8 *)&accel.dshufti.hi2);
-        dumpDShuftiCharReach(f, (const u8 *)&accel.dshufti.lo1,
-                             (const u8 *)&accel.dshufti.hi1,
-                             (const u8 *)&accel.dshufti.lo2,
-                             (const u8 *)&accel.dshufti.hi2);
+        dumpShuftiMasks(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
+                        reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
+        dumpDShuftiCharReach(f, reinterpret_cast<const u8 *>(&accel.dshufti.lo1),
+                             reinterpret_cast<const u8 *>(&accel.dshufti.hi1),
+                             reinterpret_cast<const u8 *>(&accel.dshufti.lo2),
+                             reinterpret_cast<const u8 *>(&accel.dshufti.hi2));
        break;
    case ACCEL_TRUFFLE: {
        fprintf(f, "\n");
-        dumpTruffleMasks(f, (const u8 *)&accel.truffle.mask1,
-                         (const u8 *)&accel.truffle.mask2);
-        dumpTruffleCharReach(f, (const u8 *)&accel.truffle.mask1,
-                             (const u8 *)&accel.truffle.mask2);
+        dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
+                         reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
+        dumpTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
+                             reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
+        break;
+    }
+    case ACCEL_TRUFFLE_WIDE: {
+        fprintf(f, "\n");
+        dumpTruffleMasks(f, reinterpret_cast<const u8 *>(&accel.truffle.mask_lo),
+                         reinterpret_cast<const u8 *>(&accel.truffle.mask_hi));
+        dumpWideTruffleCharReach(f, reinterpret_cast<const u8 *>(&accel.truffle.mask));
        break;
    }
    default:
--- a/src/nfa/accelcompile.cpp
+++ b/src/nfa/accelcompile.cpp
@ -84,8 +84,9 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {
 #endif

    DEBUG_PRINTF("attempting shufti for %zu chars\n", outs);
-    if (-1 != shuftiBuildMasks(info.single_stops, (u8 *)&aux->shufti.lo,
-                               (u8 *)&aux->shufti.hi)) {
+    if (-1 != shuftiBuildMasks(info.single_stops,
+                               reinterpret_cast<u8 *>(&aux->shufti.lo),
+                               reinterpret_cast<u8 *>(&aux->shufti.hi))) {
        aux->accel_type = ACCEL_SHUFTI;
        aux->shufti.offset = offset;
        DEBUG_PRINTF("shufti built OK\n");
@ -96,10 +97,20 @@ void buildAccelSingle(const AccelInfo &info, AccelAux *aux) {

    if (outs <= ACCEL_MAX_STOP_CHAR) {
        DEBUG_PRINTF("building Truffle for %zu chars\n", outs);
-        aux->accel_type = ACCEL_TRUFFLE;
        aux->truffle.offset = offset;
-        truffleBuildMasks(info.single_stops, (u8 *)&aux->truffle.mask1,
-                          (u8 *)&aux->truffle.mask2);
+#if defined(CAN_USE_WIDE_TRUFFLE)
+        if(CAN_USE_WIDE_TRUFFLE) {
+            aux->accel_type = ACCEL_TRUFFLE_WIDE;
+            truffleBuildMasksWide(info.single_stops,
+                                  reinterpret_cast<u8 *>(&aux->truffle.mask));
+        } else
+#endif
+        {
+            aux->accel_type = ACCEL_TRUFFLE;
+            truffleBuildMasks(info.single_stops,
+                              reinterpret_cast<u8 *>(&aux->truffle.mask_lo),
+                              reinterpret_cast<u8 *>(&aux->truffle.mask_hi));
+        }
        return;
    }

@ -219,8 +230,9 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
                             c1, c2);
                return;
            } else if (outs2 <= 8 &&
-                       vermicelliDouble16Build(info.double_stop2, (u8 *)&aux->dverm16.mask,
-                                               (u8 *)&aux->dverm16.firsts)) {
+                       vermicelliDouble16Build(info.double_stop2,
+                                               reinterpret_cast<u8 *>(&aux->dverm16.mask),
+                                               reinterpret_cast<u8 *>(&aux->dverm16.firsts))) {
                aux->accel_type = ACCEL_DVERM16;
                aux->dverm16.offset = offset;
                DEBUG_PRINTF("building double16-vermicelli\n");
@ -254,9 +266,11 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) {
        aux->accel_type = ACCEL_DSHUFTI;
        aux->dshufti.offset = offset;
        if (shuftiBuildDoubleMasks(
-                info.double_stop1, info.double_stop2, (u8 *)&aux->dshufti.lo1,
-                (u8 *)&aux->dshufti.hi1, (u8 *)&aux->dshufti.lo2,
-                (u8 *)&aux->dshufti.hi2)) {
+                info.double_stop1, info.double_stop2,
+                reinterpret_cast<u8 *>(&aux->dshufti.lo1),
+                reinterpret_cast<u8 *>(&aux->dshufti.hi1),
+                reinterpret_cast<u8 *>(&aux->dshufti.lo2),
+                reinterpret_cast<u8 *>(&aux->dshufti.hi2))) {
            return;
        }
    }
--- a/src/nfa/arm/shufti.hpp
+++ b/src/nfa/arm/shufti.hpp
@ -46,7 +46,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask

 template <uint16_t S>
 static really_inline
-SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
+SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_t1, SuperVector<S> chars) {

    const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
    SuperVector<S> chars_lo = chars & low4bits;
@ -57,18 +57,25 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
    c1_lo.print8("c1_lo");
    SuperVector<S> c1_hi = mask1_hi.template pshufb<true>(chars_hi);
    c1_hi.print8("c1_hi");
-    SuperVector<S> t1 = c1_lo | c1_hi;
-    t1.print8("t1");
+    SuperVector<S> new_t1 = c1_lo | c1_hi;
+    // t1 is the match mask for the first char of the patterns
+    new_t1.print8("t1");

    SuperVector<S> c2_lo = mask2_lo.template pshufb<true>(chars_lo);
    c2_lo.print8("c2_lo");
    SuperVector<S> c2_hi = mask2_hi.template pshufb<true>(chars_hi);
    c2_hi.print8("c2_hi");
    SuperVector<S> t2 = c2_lo | c2_hi;
+    // t2 is the match mask for the second char of the patterns
    t2.print8("t2");
-    t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
-    SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
+
+    // offset t1 so it aligns with t2. The hole created by the offset is filled
+    // with the last elements of the previous t1 so no info is lost.
+    // Bits set to 0 lining up indicate a match.
+    SuperVector<S> t = (new_t1.alignr(*inout_t1, S-1)) | t2;
    t.print8("t");

+    *inout_t1 = new_t1;
+
    return !t.eq(SuperVector<S>::Ones());
 }
--- a/src/nfa/arm/truffle.hpp
+++ b/src/nfa/arm/truffle.hpp
@ -1,6 +1,7 @@
 /*
 * Copyright (c) 2015-2017, Intel Corporation
 * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2023, Arm Limited
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@ -32,6 +33,204 @@
 *
 */

+#ifdef HAVE_SVE
+#ifdef HAVE_SVE2
+
+/*
+ * blockSingleMaskWideSVE2 takes in a character set (as masks) and a string and returns, for
+ * each character of the string, whether or not it is part of the set.
+ *
+ * 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' together carry a 256-bit mask where
+ * each bit represents whether or not a character is in the character set. The two vectors
+ * are concatenated into a single lookup table, so despite the 'highclear'/'highset' names
+ * the split between them follows the table index (C%32), not the MSb of the character.
+ *
+ * The mask is an array of 32 bytes and is encoded this way:
+ * Let C be a character in the set. The bit describing that character is at byte[C%32] and
+ * within that byte, it's at bit[C/32]
+ * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
+ *
+ * Assume the mask is in one of those configurations:
+ * - both masks are exactly 128b wide
+ * - the first mask is exactly 256b wide and the second is zeroed.
+ * - the first mask is more than 256b wide, with bits past the 256th being zero, and the second mask is zeroed.
+ */
+static really_inline
+svuint8_t blockSingleMaskWideSVE2(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
+    const svuint8_t pshub_mask = svdup_u8(0x1f);
+    const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
+    svuint8x2_t shuf_mask_32 = svcreate2(shuf_mask_lo_highclear, shuf_mask_lo_highset);
+    /*
+     * svtbl2 does a table lookup. Each byte in the second argument indexes into the array of bytes
+     * in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
+     * We mask the chars with 0x1f so that the low 5 bits of each char are used as the index.
+     */
+    svuint8_t byte_select = svtbl2(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
+
+    /*
+     * We now have selected the byte that contain the bit corresponding to the char. We need to
+     * further filter it, otherwise we'd get a match for any character % 32 to a searched character
+     *
+     * The low 5 bits were used previously to select the byte out of the mask. The high 3 bits
+     * are used to select the bit out of the byte. So we shift everything right by 5.
+     *
+     * Using svtbl, we can make an array where each element is a different bit. Using the high
+     * 3 bits we can get a mask selecting only the bit out of a byte that may have the relevant
+     * charset char.
+     */
+    svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5);
+    svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
+    /*
+     * We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
+     * encoding the character in char. A non zero value means the char was in the charset
+     *
+     * The _x suffix only works if we process a full char vector. If we were to use a partial
+     * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
+     * lanes may have arbitrary values
+     */
+    return svand_x(svptrue_b8(), byte_select, bit_select);
+}
+#endif //HAVE_SVE2
+
+/*
+ * blockSingleMaskSVE takes in a character set (as masks) and a string and returns, for each
+ * character of the string, whether or not it is part of the set.
+ *
+ * 'shuf_mask_lo_highclear' and 'shuf_mask_lo_highset' are 128-bit masks where each bit
+ * represents whether or not a character is in the character set. The 'highclear' and
+ * 'highset' in the name refers to the MSb of the byte of the character (allowing two
+ * 128-bit masks to cover all 256 values).
+ *
+ * The masks are arrays of 16 bytes each and are encoded this way:
+ * Let C be a character in the set. The bit describing that character is at byte[C%16] and
+ * within that byte, it's at bit[C/16]
+ * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x40 0x00 0x00 0x00 ...
+ *
+ * Assume both masks are 128b wide. If they are larger, the additional bits must be zero.
+ */
+static really_inline
+svuint8_t blockSingleMaskSVE(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
+
+    const svuint8_t highconst = svdup_u8(0x80);
+    const svuint8_t pshub_mask = svdup_u8(0x8f);
+    const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
+
+    /*
+     * svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
+     * in shuf_mask_lo_highclear and saves the result in the corresponding byte of byte_select_low.
+     * We mask the chars so that we are using the low nibble of char as the index but we keep the
+     * MSb so that high characters (not represented by the highclear mask) become an index out of
+     * bounds and result in a 0.
+     */
+    svuint8_t byte_select_low = svtbl(shuf_mask_lo_highclear, svand_x(svptrue_b8(), chars, pshub_mask));
+
+    /*
+     * We flip the MSb of the chars and do the same table lookup with the highset mask.
+     * This way it's the characters with MSb cleared that will result in out-of-bounds indexes.
+     * This allows us to cover the full range (0-127 and 128-255).
+     */
+    svuint8_t char_high_flipped = sveor_x(svptrue_b8(), chars, highconst);
+    svuint8_t byte_select_high = svtbl(shuf_mask_lo_highset, svand_x(svptrue_b8(), char_high_flipped, pshub_mask));
+
+    /*
+     * We now have selected the byte that contain the bit corresponding to the char. We need to
+     * further filter it, otherwise we'd get a match for any character % 16 to a searched character
+     *
+     * The low nibble was used previously to select the byte out of the mask. The high nibble is
+     * used to select the bit out of the byte. So we shift everything right by 4.
+     *
+     * Using svtbl, we can make an array where each element is a different bit. Using the high
+     * nibble we can get a mask selecting only the bit out of a byte that may have the relevant
+     * charset char.
+     */
+    svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 4);
+    svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
+    /*
+     * For every lane, only one of the byte selected may have a value, so we can OR them. We
+     * then apply the bit_select mask. What is left is the bit in the charset encoding the
+     * character in char. A non zero value means the char was in the charset
+     *
+     * The _x suffix only works if we process a full char vector. If we were to use a partial
+     * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
+     * lanes may have arbitrary values
+     */
+    return svand_x(svptrue_b8(), svorr_x(svptrue_b8(), byte_select_low, byte_select_high), bit_select);
+}
+
+/*
+ * blockSingleMaskWideSVE takes in a character set (as a mask) and a string and returns, for
+ * each character of the string, whether or not it is part of the set.
+ *
+ * 'shuf_mask_32' is a 256-bit mask where each bit represents whether or not a character is in
+ * the character set.
+ *
+ * The mask is an array of 32 bytes and is encoded this way:
+ * Let C be a character in the set. The bit describing that character is at byte[C%32] and
+ * within that byte, it's at bit[C/32]
+ * As example, 'a' = 0x61, so the resulting mask will be: 0x00 0x08 0x00 0x00 0x00 ...
+ *
+ * Assumes the vector is at least 256b wide; svtbl yields 0 for indices past the vector end.
+ */
+static really_inline
+svuint8_t blockSingleMaskWideSVE(svuint8_t shuf_mask_32, svuint8_t chars) {//TODO I might have issues with the type
+
+    const svuint8_t pshub_mask = svdup_u8(0x1f);
+    const svuint8_t unique_bit_per_lane_mask = svreinterpret_u8(svdup_u64(0x8040201008040201));
+
+    /*
+     * svtbl does a table lookup. Each byte in the second argument indexes into the array of bytes
+     * in shuf_mask_32 and saves the result in the corresponding byte of byte_select.
+     * We mask the chars with 0x1f so that the low 5 bits of each char are used as the index.
+     */
+    svuint8_t byte_select = svtbl(shuf_mask_32, svand_x(svptrue_b8(), chars, pshub_mask));
+
+    /*
+     * We now have selected the byte that contain the bit corresponding to the char. We need to
+     * further filter it, otherwise we'd get a match for any character % 32 to a searched character
+     *
+     * The low 5 bits were used previously to select the byte out of the mask. The high 3 bits
+     * are used to select the bit out of the byte. So we shift everything right by 5.
+     *
+     * Using svtbl, we can make an array where each element is a different bit. Using the high
+     * 3 bits we can get a mask selecting only the bit out of a byte that may have the relevant
+     * charset char.
+     */
+    svuint8_t char_high_nibble = svlsr_x(svptrue_b8(), chars, 5);
+    svuint8_t bit_select = svtbl(unique_bit_per_lane_mask, char_high_nibble);
+    /*
+     * We apply the bit_select mask onto the selected byte. What is left is the bit in the charset
+     * encoding the character in char. A non zero value means the char was in the charset
+     *
+     * The _x suffix only works if we process a full char vector. If we were to use a partial
+     * vector, then _z and a mask would be required on this svand only. Otherwise, the disabled
+     * lanes may have arbitrary values
+     */
+    return svand_x(svptrue_b8(), byte_select, bit_select);
+}
+
+/* require normal truffle compilation. The 256b mask is split between the two parameters */
+static really_inline
+svuint8_t blockSingleMask(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
+    return blockSingleMaskSVE(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
+}
+
+/* require wide truffle compilation. The 256b mask is fully contained in the first parameter */
+static really_inline
+svuint8_t blockSingleMaskWide32(svuint8_t shuf_mask_32, svuint8_t chars) {
+    return blockSingleMaskWideSVE(shuf_mask_32, chars);
+}
+
+#ifdef HAVE_SVE2
+/* Requires wide truffle compilation. The 256b mask is split between the two parameters if the
+ * vector is 128b, or fully contained in the first parameter if it is 256b or wider. */
+static really_inline
+svuint8_t blockSingleMaskWide(svuint8_t shuf_mask_lo_highclear, svuint8_t shuf_mask_lo_highset, svuint8_t chars) {
+    return blockSingleMaskWideSVE2(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
+}
+#endif //HAVE_SVE2
+#endif //HAVE_SVE
+
+/* require normal truffle compilation. The 256b mask is split between the two parameters */
 template <uint16_t S>
 static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> shuf_mask_lo_highclear, SuperVector<S> shuf_mask_lo_highset, SuperVector<S> chars) {
--- a/src/nfa/castle.c
+++ b/src/nfa/castle.c
@ -94,8 +94,8 @@ char subCastleReportCurrent(const struct Castle *c, struct mq *q,
    const struct SubCastle *sub = getSubCastle(c, subIdx);
    const struct RepeatInfo *info = getRepeatInfo(sub);

-    union RepeatControl *rctrl = getControl(q->state, sub);
-    char *rstate = (char *)q->streamState + sub->streamStateOffset +
+    const union RepeatControl *rctrl = getControl(q->state, sub);
+    const char *rstate = (char *)q->streamState + sub->streamStateOffset +
                   info->packedCtrlSize;
    enum RepeatMatch match =
        repeatHasMatch(info, rctrl, rstate, offset);
@ -118,10 +118,10 @@ int castleReportCurrent(const struct Castle *c, struct mq *q) {

    if (c->exclusive) {
        u8 *active = (u8 *)q->streamState;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("subcastle %u\n", activeIdx);
            if (subCastleReportCurrent(c, q,
@ -156,8 +156,8 @@ char subCastleInAccept(const struct Castle *c, struct mq *q,
    }
    const struct RepeatInfo *info = getRepeatInfo(sub);

-    union RepeatControl *rctrl = getControl(q->state, sub);
-    char *rstate = (char *)q->streamState + sub->streamStateOffset +
+    const union RepeatControl *rctrl = getControl(q->state, sub);
+    const char *rstate = (char *)q->streamState + sub->streamStateOffset +
                   info->packedCtrlSize;
    enum RepeatMatch match =
        repeatHasMatch(info, rctrl, rstate, offset);
@ -180,10 +180,10 @@ char castleInAccept(const struct Castle *c, struct mq *q,

    if (c->exclusive) {
        u8 *active = (u8 *)q->streamState;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("subcastle %u\n", activeIdx);
            if (subCastleInAccept(c, q, report, offset, activeIdx)) {
@ -213,8 +213,8 @@ void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset,
    const struct SubCastle *sub = getSubCastle(c, subIdx);
    const struct RepeatInfo *info = getRepeatInfo(sub);

-    union RepeatControl *rctrl = getControl(full_state, sub);
-    char *rstate = (char *)stream_state + sub->streamStateOffset +
+    const union RepeatControl *rctrl = getControl(full_state, sub);
+    const char *rstate = (char *)stream_state + sub->streamStateOffset +
                       info->packedCtrlSize;

    if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) {
@ -242,10 +242,10 @@ void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset,

    if (c->exclusive) {
        u8 *active = (u8 *)stream_state;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("subcastle %u\n", activeIdx);
            subCastleDeactivateStaleSubs(c, offset, full_state,
@ -329,8 +329,8 @@ void subCastleFindMatch(const struct Castle *c, const u64a begin,
                        size_t *mloc, char *found, const u32 subIdx) {
    const struct SubCastle *sub = getSubCastle(c, subIdx);
    const struct RepeatInfo *info = getRepeatInfo(sub);
-    union RepeatControl *rctrl = getControl(full_state, sub);
-    char *rstate = (char *)stream_state + sub->streamStateOffset +
+    const union RepeatControl *rctrl = getControl(full_state, sub);
+    const char *rstate = (char *)stream_state + sub->streamStateOffset +
                   info->packedCtrlSize;

    u64a match = repeatNextMatch(info, rctrl, rstate, begin);
@ -374,10 +374,10 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,

    if (c->exclusive) {
        u8 *active = (u8 *)stream_state;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("subcastle %u\n", activeIdx);
            subCastleFindMatch(c, begin, end, full_state, stream_state, mloc,
@ -386,7 +386,7 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
    }

    if (c->exclusive != PURE_EXCLUSIVE) {
-        u8 *active = (u8 *)stream_state + c->activeOffset;
+        const u8 *active = (u8 *)stream_state + c->activeOffset;
        for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID);
             i != MMB_INVALID;
             i = mmbit_iterate(active, c->numRepeats, i)) {
@ -400,8 +400,8 @@ char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end,
 }

 static really_inline
-u64a subCastleNextMatch(const struct Castle *c, void *full_state,
-                        void *stream_state, const u64a loc,
+u64a subCastleNextMatch(const struct Castle *c, const void *full_state,
+                        const void *stream_state, const u64a loc,
                        const u32 subIdx) {
    DEBUG_PRINTF("subcastle %u\n", subIdx);
    const struct SubCastle *sub = getSubCastle(c, subIdx);
@ -489,15 +489,14 @@ char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end,
        // full_state (scratch).

        u64a offset = end; // min offset of next match
-        u32 activeIdx = 0;
        mmbit_clear(matching, c->numRepeats);
        if (c->exclusive) {
            u8 *active = (u8 *)stream_state;
            u8 *groups = active + c->groupIterOffset;
            for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
                 i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-                u8 *cur = active + i * c->activeIdxSize;
-                activeIdx = partial_load_u32(cur, c->activeIdxSize);
+                const u8 *cur = active + i * c->activeIdxSize;
+                u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
                u64a match = subCastleNextMatch(c, full_state, stream_state,
                                                loc, activeIdx);
                set_matching(c, match, groups, matching, c->numGroups, i,
@ -797,7 +796,7 @@ char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end,

        char found = 0;
        if (c->exclusive) {
-            u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+            const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
            found = mmbit_any(groups, c->numGroups);
        }

@ -864,7 +863,7 @@ char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end,
    }

    if (c->exclusive) {
-        u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+        const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
        if (mmbit_any_precise(groups, c->numGroups)) {
            return 1;
        }
@ -884,7 +883,7 @@ char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) {
 }

 static
-s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
+s64a castleLastKillLoc(const struct Castle *c, const struct mq *q) {
    assert(q_cur_type(q) == MQE_START);
    assert(q_last_type(q) == MQE_END);
    s64a sp = q_cur_loc(q);
@ -907,7 +906,6 @@ s64a castleLastKillLoc(const struct Castle *c, struct mq *q) {
        if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) {
            return (s64a)loc - hlen;
        }
-        ep = 0;
    }

    return sp - 1; /* the repeats are never killed */
@ -959,7 +957,7 @@ char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) {

    char found = 0;
    if (c->exclusive) {
-        u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
+        const u8 *groups = (u8 *)q->streamState + c->groupIterOffset;
        found = mmbit_any_precise(groups, c->numGroups);

    }
@ -1007,10 +1005,10 @@ char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) {

    if (c->exclusive) {
        u8 *active = (u8 *)q->streamState;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("subcastle %u\n", activeIdx);
            const struct SubCastle *sub = getSubCastle(c, activeIdx);
@ -1079,7 +1077,7 @@ void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx,
                                 const struct mq *q, const u64a offset) {
    const struct SubCastle *sub = getSubCastle(c, subIdx);
    const struct RepeatInfo *info = getRepeatInfo(sub);
-    union RepeatControl *rctrl = getControl(q->state, sub);
+    const union RepeatControl *rctrl = getControl(q->state, sub);
    char *packed = (char *)q->streamState + sub->streamStateOffset;
    DEBUG_PRINTF("sub %u next match %llu\n", subIdx,
                 repeatNextMatch(info, rctrl,
@ -1100,10 +1098,10 @@ char nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q,
    DEBUG_PRINTF("offset=%llu\n", offset);
    if (c->exclusive) {
        u8 *active = (u8 *)q->streamState;
-        u8 *groups = active + c->groupIterOffset;
+        const u8 *groups = active + c->groupIterOffset;
        for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID);
             i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) {
-            u8 *cur = active + i * c->activeIdxSize;
+            const u8 *cur = active + i * c->activeIdxSize;
            const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize);
            DEBUG_PRINTF("packing state for sub %u\n", activeIdx);
            subCastleQueueCompressState(c, activeIdx, q, offset);
--- a/src/nfa/castle_dump.cpp
+++ b/src/nfa/castle_dump.cpp
@ -56,7 +56,7 @@ namespace ue2 {
 static
 void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
    const RepeatInfo *info =
-        (const RepeatInfo *)((const char *)&sub + sub.repeatInfoOffset);
+        reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(&sub) + sub.repeatInfoOffset);
    fprintf(f, "  repeat model:          %s\n", repeatTypeName(info->type));
    fprintf(f, "  repeat bounds:         {%u, %u}\n", info->repeatMin,
            info->repeatMax);
@ -69,7 +69,7 @@ void dumpTextSubCastle(const SubCastle &sub, FILE *f) {
 }

 void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
-    const Castle *c = (const Castle *)getImplNfa(nfa);
+    const Castle *c = reinterpret_cast<const Castle *>(getImplNfa(nfa));

    StdioFile f(base + ".txt", "w");

@ -88,15 +88,15 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
        fprintf(f, "negated verm, scanning for 0x%02x\n", c->u.verm.c);
        break;
    case CASTLE_SHUFTI: {
-        const CharReach cr = shufti2cr((const u8 *)&c->u.shuf.mask_lo,
-                                       (const u8 *)&c->u.shuf.mask_hi);
+        const CharReach cr = shufti2cr(reinterpret_cast<const u8 *>(&c->u.shuf.mask_lo),
+                                       reinterpret_cast<const u8 *>(&c->u.shuf.mask_hi));
        fprintf(f, "shufti, scanning for %s (%zu chars)\n",
                describeClass(cr).c_str(), cr.count());
        break;
    }
    case CASTLE_TRUFFLE: {
-        const CharReach cr = truffle2cr((const u8 *)&c->u.truffle.mask1,
-                                        (const u8 *)&c->u.truffle.mask2);
+        const CharReach cr = truffle2cr(reinterpret_cast<const u8 *>(&c->u.truffle.mask1),
+                                        reinterpret_cast<const u8 *>(&c->u.truffle.mask2));
        fprintf(f, "truffle, scanning for %s (%zu chars)\n",
                describeClass(cr).c_str(), cr.count());
        break;
@ -112,7 +112,7 @@ void nfaExecCastle_dump(const struct NFA *nfa, const string &base) {
    fprintf(f, "\n");

    const SubCastle *sub =
-        (const SubCastle *)((const char *)c + sizeof(Castle));
+        reinterpret_cast<const SubCastle *>(reinterpret_cast<const char *>(c) + sizeof(Castle));
    for (u32 i = 0; i < c->numRepeats; i++) {
        fprintf(f, "Sub %u:\n", i);
        dumpTextSubCastle(sub[i], f);
--- a/src/nfa/castlecompile.cpp
+++ b/src/nfa/castlecompile.cpp
@ -106,25 +106,27 @@ void writeCastleScanEngine(const CharReach &cr, Castle *c) {
 #ifdef HAVE_SVE2
    if (cr.count() <= 16) {
        c->type = CASTLE_NVERM16;
-        vermicelli16Build(cr, (u8 *)&c->u.verm16.mask);
+        vermicelli16Build(cr, reinterpret_cast<u8 *>(&c->u.verm16.mask));
        return;
    }
    if (negated.count() <= 16) {
        c->type = CASTLE_VERM16;
-        vermicelli16Build(negated, (u8 *)&c->u.verm16.mask);
+        vermicelli16Build(negated, reinterpret_cast<u8 *>(&c->u.verm16.mask));
        return;
    }
 #endif // HAVE_SVE2

-    if (shuftiBuildMasks(negated, (u8 *)&c->u.shuf.mask_lo,
-                         (u8 *)&c->u.shuf.mask_hi) != -1) {
+    if (shuftiBuildMasks(negated,
+                         reinterpret_cast<u8 *>(&c->u.shuf.mask_lo),
+                         reinterpret_cast<u8 *>(&c->u.shuf.mask_hi)) != -1) {
        c->type = CASTLE_SHUFTI;
        return;
    }

    c->type = CASTLE_TRUFFLE;
-    truffleBuildMasks(negated, (u8 *)(u8 *)&c->u.truffle.mask1,
-                      (u8 *)&c->u.truffle.mask2);
+    truffleBuildMasks(negated,
+                      reinterpret_cast<u8 *>(&c->u.truffle.mask1),
+                      reinterpret_cast<u8 *>(&c->u.truffle.mask2));
 }

 static
@ -227,11 +229,13 @@ vector<u32> removeClique(CliqueGraph &cg) {
    while (!graph_empty(cg)) {
        const vector<u32> &c = cliquesVec.back();
        vector<CliqueVertex> dead;
-        for (const auto &v : vertices_range(cg)) {
-            if (find(c.begin(), c.end(), cg[v].stateId) != c.end()) {
-                dead.emplace_back(v);
-            }
-        }
+
+        auto deads = [&c=c, &cg=cg](const CliqueVertex &v) {
+            return (find(c.begin(), c.end(), cg[v].stateId) != c.end());
+        };
+        const auto &vr = vertices_range(cg);
+        std::copy_if(begin(vr), end(vr),  std::back_inserter(dead), deads);
+
        for (const auto &v : dead) {
            clear_vertex(v, cg);
            remove_vertex(v, cg);
@ -294,7 +298,7 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,
    size_t lower = 0;
    size_t total = 0;
    while (lower < trigSize) {
-        vector<CliqueVertex> vertices;
+        vector<CliqueVertex> clvertices;
        unique_ptr<CliqueGraph> cg = make_unique<CliqueGraph>();

        vector<vector<size_t>> min_reset_dist;
@ -302,7 +306,7 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,
        // get min reset distance for each repeat
        for (size_t i = lower; i < upper; i++) {
            CliqueVertex v = add_vertex(CliqueVertexProps(i), *cg);
-            vertices.emplace_back(v);
+            clvertices.emplace_back(v);

            const vector<size_t> &tmp_dist =
                minResetDistToEnd(triggers[i], cr);
@ -311,11 +315,11 @@ vector<vector<u32>> checkExclusion(u32 &streamStateSize,

        // find exclusive pair for each repeat
        for (size_t i = lower; i < upper; i++) {
-            CliqueVertex s = vertices[i - lower];
+            CliqueVertex s = clvertices[i - lower];
            for (size_t j = i + 1; j < upper; j++) {
                if (findExclusivePair(i, j, lower, min_reset_dist,
                                      triggers)) {
-                    CliqueVertex d = vertices[j - lower];
+                    CliqueVertex d = clvertices[j - lower];
                    add_edge(s, d, *cg);
                }
            }
@ -600,9 +604,9 @@ buildCastle(const CastleProto &proto,
    nfa->minWidth = verify_u32(minWidth);
    nfa->maxWidth = maxWidth.is_finite() ? verify_u32(maxWidth) : 0;

-    char * const base_ptr = (char *)nfa.get() + sizeof(NFA);
+    char * const base_ptr = reinterpret_cast<char *>(nfa.get()) + sizeof(NFA);
    char *ptr = base_ptr;
-    Castle *c = (Castle *)ptr;
+    Castle *c = reinterpret_cast<Castle *>(ptr);
    c->numRepeats = verify_u32(subs.size());
    c->numGroups = exclusiveInfo.numGroups;
    c->exclusive = verify_s8(exclusive);
@ -613,7 +617,7 @@ buildCastle(const CastleProto &proto,
    writeCastleScanEngine(cr, c);

    ptr += sizeof(Castle);
-    SubCastle *subCastles = ((SubCastle *)(ROUNDUP_PTR(ptr, alignof(u32))));
+    SubCastle *subCastles = reinterpret_cast<SubCastle *>(ROUNDUP_PTR(ptr, alignof(u32)));
    copy(subs.begin(), subs.end(), subCastles);

    u32 length = 0;
@ -623,16 +627,17 @@ buildCastle(const CastleProto &proto,
        SubCastle *sub = &subCastles[i];
        sub->repeatInfoOffset = offset;

-        ptr = (char *)sub + offset;
+        ptr = reinterpret_cast<char *>(sub) + offset;
        memcpy(ptr, &infos[i], sizeof(RepeatInfo));

        if (patchSize[i]) {
-            RepeatInfo *info = (RepeatInfo *)ptr;
-            u64a *table = ((u64a *)(ROUNDUP_PTR(((char *)(info) +
-                                    sizeof(*info)), alignof(u64a))));
+            RepeatInfo *info = reinterpret_cast<RepeatInfo *>(ptr);
+	    char *info_base = reinterpret_cast<char *>(info);
+            u64a *table = reinterpret_cast<u64a *>(ROUNDUP_PTR(info_base +
+                                    sizeof(*info), alignof(u64a)));
            copy(tables.begin() + tableIdx,
                 tables.begin() + tableIdx + patchSize[i], table);
-            u32 diff = (char *)table - (char *)info +
+            u32 diff = reinterpret_cast<char *>(table) - info_base +
                       sizeof(u64a) * patchSize[i];
            info->length = diff;
            length += diff;
@ -655,7 +660,6 @@ buildCastle(const CastleProto &proto,
    if (!stale_iter.empty()) {
        c->staleIterOffset = verify_u32(ptr - base_ptr);
        copy_bytes(ptr, stale_iter);
-        ptr += byte_length(stale_iter);
    }

    return nfa;
@ -672,6 +676,7 @@ set<ReportID> all_reports(const CastleProto &proto) {
 depth findMinWidth(const CastleProto &proto) {
    depth min_width(depth::infinity());
    for (const PureRepeat &pr : proto.repeats | map_values) {
+        // cppcheck-suppress useStlAlgorithm
        min_width = min(min_width, pr.bounds.min);
    }
    return min_width;
@ -680,6 +685,7 @@ depth findMinWidth(const CastleProto &proto) {
 depth findMaxWidth(const CastleProto &proto) {
    depth max_width(0);
    for (const PureRepeat &pr : proto.repeats | map_values) {
+        // cppcheck-suppress useStlAlgorithm
        max_width = max(max_width, pr.bounds.max);
    }
    return max_width;
@ -746,6 +752,7 @@ u32 CastleProto::merge(const PureRepeat &pr) {

    // First, see if this repeat is already in this castle.
    for (const auto &m : repeats) {
+        // cppcheck-suppress useStlAlgorithm
        if (m.second == pr) {
            DEBUG_PRINTF("repeat already present, with top %u\n", m.first);
            return m.first;
@ -919,7 +926,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
    u32 min_bound = pr.bounds.min; // always finite
    if (min_bound == 0) { // Vacuous case, we can only do this once.
        assert(!edge(g.start, g.accept, g).second);
-        NFAEdge e = add_edge(g.start, g.accept, g);
+        NFAEdge e = add_edge(g.start, g.accept, g).first;
        g[e].tops.insert(top);
        g[u].reports.insert(pr.reports.begin(), pr.reports.end());
        min_bound = 1;
@ -928,7 +935,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
    for (u32 i = 0; i < min_bound; i++) {
        NFAVertex v = add_vertex(g);
        g[v].char_reach = pr.reach;
-        NFAEdge e = add_edge(u, v, g);
+        NFAEdge e = add_edge(u, v, g).first;
        if (u == g.start) {
            g[e].tops.insert(top);
        }
@ -947,7 +954,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
            if (head != u) {
                add_edge(head, v, g);
            }
-            NFAEdge e = add_edge(u, v, g);
+            NFAEdge e = add_edge(u, v, g).first;
            if (u == g.start) {
                g[e].tops.insert(top);
            }
@ -970,6 +977,7 @@ void addToHolder(NGHolder &g, u32 top, const PureRepeat &pr) {
 static
 bool hasZeroMinBound(const CastleProto &proto) {
    const depth zero(0);
+    // cppcheck-suppress useStlAlgorithm
    for (const PureRepeat &pr : proto.repeats | map_values) {
        if (pr.bounds.min == zero) {
            return true;
--- a/src/nfa/dfa_min.cpp
+++ b/src/nfa/dfa_min.cpp
@ -263,6 +263,7 @@ void mapping_new_states(const HopcroftInfo &info,
    new_states.reserve(num_partitions);

    for (const auto &m : ordering) {
+        // cppcheck-suppress useStlAlgorithm
        new_states.emplace_back(rdfa.states[m.first]);
    }
    rdfa.states = std::move(new_states);
@ -304,6 +305,7 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
        DEBUG_PRINTF("dfa is empty\n");
    }

+    // cppcheck-suppress unreadVariable
    UNUSED const size_t states_before = rdfa.states.size();

    HopcroftInfo info(rdfa);
--- a/src/nfa/gough.c
+++ b/src/nfa/gough.c
@ -978,14 +978,14 @@ char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset,
 char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) {
    const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
    NfaCallback cb = q->cb;
-    void *ctxt = q->context;
    u8 s = *(u8 *)q->state;
    u64a offset = q_cur_offset(q);
-    struct gough_som_info *som = getSomInfo(q->state);
+    const struct gough_som_info *som = getSomInfo(q->state);
    assert(q_cur_type(q) == MQE_START);
    assert(s);

    if (s >= m->accept_limit_8) {
+        void *ctxt = q->context;
        u32 cached_accept_id = 0;
        u16 cached_accept_state = 0;
        u32 cached_accept_som = 0;
@ -1000,16 +1000,16 @@ char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) {
 char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) {
    const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n);
    NfaCallback cb = q->cb;
-    void *ctxt = q->context;
    u16 s = *(u16 *)q->state;
    const struct mstate_aux *aux = get_aux(m, s);
    u64a offset = q_cur_offset(q);
-    struct gough_som_info *som = getSomInfo(q->state);
+    const struct gough_som_info *som = getSomInfo(q->state);
    assert(q_cur_type(q) == MQE_START);
    DEBUG_PRINTF("state %hu\n", s);
    assert(s);

    if (aux->accept) {
+        void *ctxt = q->context;
        u32 cached_accept_id = 0;
        u16 cached_accept_state = 0;
        u32 cached_accept_som = 0;
--- a/src/nfa/gough_internal.h
+++ b/src/nfa/gough_internal.h
@ -92,6 +92,7 @@ struct gough_info {
 static really_inline
 const struct gough_info *get_gough(const struct mcclellan *m) {
    assert(m->haig_offset);
+    // cppcheck-suppress cstyleCast
    const char *n = (const char *)m - sizeof(struct NFA);
    return (const struct gough_info *)(n + m->haig_offset);
 }
@ -102,6 +103,7 @@ const u32 *get_gough_top_offsets(const struct mcclellan *m) {
    if (!g->top_prog_offset) {
        return NULL;
    }
+    // cppcheck-suppress cstyleCast
    const char *n = (const char *)m - sizeof(struct NFA);
    return (const u32 *)(n + g->top_prog_offset);
 }
--- a/src/nfa/goughcompile.cpp
+++ b/src/nfa/goughcompile.cpp
@ -132,7 +132,7 @@ void GoughSSAVarMin::replace_input(GoughSSAVar *old_v, GoughSSAVar *new_v) {
 }

 static
-void translateRawReports(UNUSED GoughGraph &cfg, UNUSED const raw_som_dfa &raw,
+void translateRawReports(UNUSED const GoughGraph &cfg, UNUSED const raw_som_dfa &raw,
                         const flat_map<u32, GoughSSAVarJoin *> &joins_at_s,
                         UNUSED GoughVertex s,
                         const set<som_report> &reports_in,
@ -206,10 +206,6 @@ void makeCFG_top_edge(GoughGraph &cfg, const vector<GoughVertex> &vertices,
            assert(contains(src_slots, slot_id));

            shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
-            if (!vmin) {
-                assert(0);
-                throw std::bad_alloc();
-            }
            cfg[e].vars.emplace_back(vmin);
            final_var = vmin.get();

@ -321,10 +317,6 @@ void makeCFG_edge(GoughGraph &cfg, const map<u32, u32> &som_creators,
            DEBUG_PRINTF("bypassing min on join %u\n", slot_id);
        } else {
            shared_ptr<GoughSSAVarMin> vmin = make_shared<GoughSSAVarMin>();
-            if (!vmin) {
-                assert(0);
-                throw std::bad_alloc();
-            }
            cfg[e].vars.emplace_back(vmin);
            final_var = vmin.get();

@ -441,10 +433,11 @@ unique_ptr<GoughGraph> makeCFG(const raw_som_dfa &raw) {
 }

 static
+// cppcheck-suppress constParameterReference
 void copy_propagate_report_set(vector<pair<ReportID, GoughSSAVar *> > &rep) {
    vector<pair<ReportID, GoughSSAVar *> >::iterator it = rep.begin();
    while (it != rep.end()) {
-        GoughSSAVar *var = it->second;
+        const GoughSSAVar *var = it->second;
        if (!var) {
            ++it;
            continue;
@ -546,7 +539,7 @@ void remove_dead(GoughGraph &g) {
    }

    while (!queue.empty()) {
-        GoughSSAVar *v = queue.back();
+        const GoughSSAVar *v = queue.back();
        queue.pop_back();
        for (GoughSSAVar *var : v->get_inputs()) {
            if (var->seen) {
@ -602,6 +595,7 @@ void GoughSSAVarNew::generate(vector<gough_ins> *out) const {
 #ifndef NDEBUG
 template<typename C, typename K>
 bool contains_loose(const C &container, const K &key) {
+    // cppcheck-suppress useStlAlgorithm
    for (const auto &elem : container) {
        if (elem == key) {
            return true;
@ -650,6 +644,7 @@ void GoughSSAVarJoin::generate(UNUSED vector<gough_ins> *out) const {

 GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
    for (const auto &var_edge : input_map) {
+        // cppcheck-suppress useStlAlgorithm
        if (contains(var_edge.second, prev)) {
            return var_edge.first;
        }
@ -658,8 +653,8 @@ GoughSSAVar *GoughSSAVarJoin::get_input(const GoughEdge &prev) const {
    return nullptr;
 }

-const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(
-                                                 GoughSSAVar *input) const {
+// cppcheck-suppress constParameterPointer
+const flat_set<GoughEdge> &GoughSSAVarJoin::get_edges_for_input(GoughSSAVar *input) const {
    return input_map.at(input);
 }

@ -810,7 +805,7 @@ private:

 static
 void prep_joins_for_generation(const GoughGraph &g, GoughVertex v,
-                               map<GoughEdge, edge_join_info> *edge_info) {
+                               map<GoughEdge, edge_join_info> &edge_info) {
    DEBUG_PRINTF("writing out joins for %u\n", g[v].state_id);
    for (const auto &var : g[v].vars) {
        u32 dest_slot = var->slot;
@ -821,7 +816,7 @@ void prep_joins_for_generation(const GoughGraph &g, GoughVertex v,
            }

            for (const GoughEdge &incoming_edge : var_edges.second) {
-                (*edge_info)[incoming_edge].insert(input, dest_slot);
+                edge_info[incoming_edge].insert(input, dest_slot);
                DEBUG_PRINTF("need %u<-%u\n", dest_slot, input);
            }
        }
@ -919,7 +914,7 @@ void build_blocks(const GoughGraph &g,
        }

        map<GoughEdge, edge_join_info> eji;
-        prep_joins_for_generation(g, t, &eji);
+        prep_joins_for_generation(g, t, eji);

        for (auto &m : eji) {
            vector<gough_ins> &block = (*blocks)[gough_edge_id(g, m.first)];
@ -1017,7 +1012,7 @@ void update_accel_prog_offset(const gough_build_strat &gbs,
        verts[gbs.gg[v].state_id] = v;
    }

-    for (auto &m : gbs.built_accel) {
+    for (const auto &m : gbs.built_accel) {
        gough_accel *ga = m.first;
        assert(!ga->prog_offset);
        GoughVertex v = verts[m.second];
@ -1050,7 +1045,7 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
           || !cc.streaming);

    if (!cc.grey.allowGough) {
-        return nullptr;
+        return bytecode_ptr<NFA>(nullptr);
    }

    DEBUG_PRINTF("hello world\n");
@ -1081,11 +1076,11 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
    auto basic_dfa = mcclellanCompile_i(raw, gbs, cc);
    assert(basic_dfa);
    if (!basic_dfa) {
-        return nullptr;
+        return bytecode_ptr<NFA>(nullptr);
    }

-    u8 alphaShift
-        = ((const mcclellan *)getImplNfa(basic_dfa.get()))->alphaShift;
+    const auto nfa = static_cast<const mcclellan *>(getImplNfa(basic_dfa.get()));
+    u8 alphaShift = nfa->alphaShift;
    u32 edge_count = (1U << alphaShift) * raw.states.size();

    u32 curr_offset = ROUNDUP_N(basic_dfa->length, 4);
@ -1126,8 +1121,8 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
    u32 gough_size = ROUNDUP_N(curr_offset, 16);
    auto gough_dfa = make_zeroed_bytecode_ptr<NFA>(gough_size);

-    memcpy(gough_dfa.get(), basic_dfa.get(), basic_dfa->length);
-    memcpy((char *)gough_dfa.get() + haig_offset, &gi, sizeof(gi));
+    memcpy(reinterpret_cast<char *>(gough_dfa.get()), basic_dfa.get(), basic_dfa->length);
+    memcpy(reinterpret_cast<char *>(gough_dfa.get()) + haig_offset, &gi, sizeof(gi));
    if (gough_dfa->type == MCCLELLAN_NFA_16) {
        gough_dfa->type = GOUGH_NFA_16;
    } else {
@ -1140,18 +1135,18 @@ bytecode_ptr<NFA> goughCompile(raw_som_dfa &raw, u8 somPrecision,
    gough_dfa->streamStateSize = base_state_size + slot_count * somPrecision;
    gough_dfa->scratchStateSize = (u32)(16 + scratch_slot_count * sizeof(u64a));

-    mcclellan *m = (mcclellan *)getMutableImplNfa(gough_dfa.get());
+    auto *m = reinterpret_cast<mcclellan *>(getMutableImplNfa(gough_dfa.get()));
    m->haig_offset = haig_offset;

    /* update nfa length, haig_info offset (leave mcclellan length alone) */
    gough_dfa->length = gough_size;

    /* copy in blocks */
-    copy_bytes((u8 *)gough_dfa.get() + edge_prog_offset, edge_blocks);
+    copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + edge_prog_offset, edge_blocks);
    if (top_prog_offset) {
-        copy_bytes((u8 *)gough_dfa.get() + top_prog_offset, top_blocks);
+        copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + top_prog_offset, top_blocks);
    }
-    copy_bytes((u8 *)gough_dfa.get() + prog_base_offset, temp_blocks);
+    copy_bytes(reinterpret_cast<u8 *>(gough_dfa.get()) + prog_base_offset, temp_blocks);

    return gough_dfa;
 }
@ -1184,7 +1179,7 @@ AccelScheme gough_build_strat::find_escape_strings(dstate_id_t this_idx) const {
 void gough_build_strat::buildAccel(dstate_id_t this_idx, const AccelScheme &info,
                                   void *accel_out) {
    assert(mcclellan_build_strat::accelSize() == sizeof(AccelAux));
-    gough_accel *accel = (gough_accel *)accel_out;
+    gough_accel *accel = static_cast<gough_accel *>(accel_out);
    /* build a plain accelaux so we can work out where we can get to */
    mcclellan_build_strat::buildAccel(this_idx, info, &accel->accel);
    DEBUG_PRINTF("state %hu is accel with type %hhu\n", this_idx,
@ -1299,7 +1294,7 @@ unique_ptr<raw_report_info> gough_build_strat::gatherReports(
    *arbReport = MO_INVALID_IDX;
    assert(!ri->rl.empty()); /* all components should be able to generate
                                reports */
-    return std::move(ri);
+    return ri;
 }

 u32 raw_gough_report_info_impl::getReportListSize() const {
@ -1322,7 +1317,8 @@ void raw_gough_report_info_impl::fillReportLists(NFA *n, size_t base_offset,
    for (const raw_gough_report_list &r : rl) {
        ro.emplace_back(base_offset);

-        gough_report_list *p = (gough_report_list *)((char *)n + base_offset);
+        u8 * n_ptr = reinterpret_cast<u8 *>(n);
+        gough_report_list *p = reinterpret_cast<gough_report_list *>(n_ptr + base_offset);
        u32 i = 0;

        for (const som_report &sr : r.reports) {
--- a/src/nfa/goughcompile_accel.cpp
+++ b/src/nfa/goughcompile_accel.cpp
@ -146,6 +146,7 @@ bool verify_neighbour(const GoughGraph &g, GoughVertex u,
                      const map<gough_edge_id, vector<gough_ins> > &blocks,
                      const set<GoughVertex> &succs,
                      const vector<gough_ins> &block_sl) {
+    // cppcheck-suppress useStlAlgorithm
    for (const auto &e : out_edges_range(u, g)) {
        if (!g[e].reach.any()) { /* ignore top edges */
            continue;
@ -172,6 +173,7 @@ static
 bool verify_neighbour_no_block(const GoughGraph &g, GoughVertex u,
                        const map<gough_edge_id, vector<gough_ins> > &blocks,
                        const set<GoughVertex> &succs) {
+    // cppcheck-suppress useStlAlgorithm
    for (const auto &e : out_edges_range(u, g)) {
        if (!g[e].reach.any()) { /* ignore top edges */
            continue;
@ -229,6 +231,7 @@ bool allow_two_byte_accel(const GoughGraph &g,
            succs.insert(target(e, g));
        }

+        // cppcheck-suppress useStlAlgorithm
        for (auto w : adjacent_vertices_range(v, g)) {
            if (w != v && !verify_neighbour(g, w, blocks, succs, block_sl)) {
                return false;
@ -249,6 +252,7 @@ bool allow_two_byte_accel(const GoughGraph &g,
            }
            succs.insert(target(e, g));

+            // cppcheck-suppress useStlAlgorithm
            for (auto w : adjacent_vertices_range(v, g)) {
                if (w != v && !verify_neighbour_no_block(g, w, blocks, succs)) {
                    return false;
--- a/src/nfa/goughcompile_dump.cpp
+++ b/src/nfa/goughcompile_dump.cpp
@ -145,7 +145,8 @@ void dump_var_mapping(const GoughGraph &g, const string &base,
            fprintf(f, "\tuses:");
            vector<u32> used_id;
            for (const GoughSSAVar *var : used) {
-                used_id.emplace_back(var->slot);
+                // cppcheck-suppress useStlAlgorithm
+                used_id.emplace_back(var->slot);    // NOLINT(performance-inefficient-vector-operation)
            }
            for (const u32 &id : used_id) {
                fprintf(f, " %u", id);
@ -167,7 +168,8 @@ void dump_var_mapping(const GoughGraph &g, const string &base,
            fprintf(f, "\tuses:");
            vector<u32> used_id;
            for (const GoughSSAVar *var : used) {
-                used_id.emplace_back(var->slot);
+                // cppcheck-suppress useStlAlgorithm
+                used_id.emplace_back(var->slot);    // NOLINT(performance-inefficient-vector-operation)
            }
            for (const u32 &id : used_id) {
                fprintf(f, " %u", id);
--- a/src/nfa/goughcompile_reg.cpp
+++ b/src/nfa/goughcompile_reg.cpp
@ -51,6 +51,7 @@ namespace ue2 {
 template<typename VarP, typename VarQ>
 void emplace_back_all_raw(vector<VarP> *out, const vector<VarQ> &in) {
    for (const auto &var : in) {
+        // cppcheck-suppress useStlAlgorithm
        out->emplace_back(var.get());
    }
 }
@ -194,7 +195,7 @@ void handle_pending_vars(GoughSSAVar *def, const GoughGraph &g,
        if (contains(aux.containing_v, var)) {
            /* def is used by join vertex, value only needs to be live on some
             * incoming edges */
-            GoughSSAVarJoin *vj = (GoughSSAVarJoin *)var;
+            const GoughSSAVarJoin *vj = static_cast<const GoughSSAVarJoin *>(var);
            const flat_set<GoughEdge> &live_edges
                = vj->get_edges_for_input(def);
            for (const auto &e : live_edges) {
@ -278,7 +279,7 @@ set<const GoughSSAVar *> live_during(GoughSSAVar *def, const GoughGraph &g,

 template<typename VarP>
 void set_initial_slots(const vector<VarP> &vars, u32 *next_slot) {
-    for (auto &var : vars) {
+    for (const auto &var : vars) {
        assert(var->slot == INVALID_SLOT);
        var->slot = (*next_slot)++;
    }
@ -380,6 +381,7 @@ template<typename VarP>
 void add_to_dom_ordering(const vector<VarP> &vars,
                         vector<GoughSSAVar *> *out) {
    for (const auto &var : vars) {
+        // cppcheck-suppress useStlAlgorithm
        out->emplace_back(var.get());
    }
 }
@ -438,7 +440,7 @@ void create_slot_mapping(const GoughGraph &cfg, UNUSED u32 old_slot_count,
 }

 static
-void update_local_slots(GoughGraph &g, set<GoughSSAVar *> &locals,
+void update_local_slots(GoughGraph &g, const set<GoughSSAVar *> &locals,
                        u32 local_base) {
    DEBUG_PRINTF("%zu local variables\n", locals.size());
    /* local variables only occur on edges (joins are never local) */
--- a/src/nfa/goughdump.cpp
+++ b/src/nfa/goughdump.cpp
@ -59,14 +59,14 @@ namespace ue2 {
 static
 void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
    assert(isGoughType(n->type));
-    const mcclellan *m = (const mcclellan *)getImplNfa(n);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(n));
    const mstate_aux *aux = getAux(n, s);
    const u32 as = m->alphaShift;
    const char *sher_base
-        = (const char *)m - sizeof(struct NFA) + m->sherman_offset;
+        = reinterpret_cast<const char *>(m) - sizeof(struct NFA) + m->sherman_offset;

    if (n->type == GOUGH_NFA_8) {
-        const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcclellan));
+        const u8 *succ_table = reinterpret_cast<const u8 *>(reinterpret_cast<const char *>(m) + sizeof(mcclellan));
        for (u16 c = 0; c < N_CHARS; c++) {
            t[c] = succ_table[((u32)s << as) + m->remap[c]];
        }
@ -76,14 +76,14 @@ void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
        if (s >= m->sherman_limit) {
            const char *state_base
                = findShermanState(m, sher_base, m->sherman_limit, s);
-            base_s = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET);
+            base_s = *(reinterpret_cast<const u16 *>(state_base + SHERMAN_DADDY_OFFSET));
        }

-        const u16 *succ_table = (const u16 *)((const char *)m
+        const u16 *succ_table = reinterpret_cast<const u16 *>(reinterpret_cast<const char *>(m)
                                              + sizeof(mcclellan));
        for (u16 c = 0; c < N_CHARS; c++) {
            const u8 *addr
-                = (const u8*)(succ_table + (((u32)base_s << as) + m->remap[c]));
+                = reinterpret_cast<const u8*>(succ_table + (((u32)base_s << as) + m->remap[c]));
            t[c] = unaligned_load_u16(addr);
            t[c] &= STATE_MASK;
        }
@ -91,15 +91,15 @@ void goughGetTransitions(const NFA *n, u16 s, u16 *t) {
        if (s >= m->sherman_limit) {
            const char *state_base
                = findShermanState(m, sher_base, m->sherman_limit, s);
-            u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base);
-            const u8 *chars = (const u8 *)state_base + SHERMAN_CHARS_OFFSET;
+            u8 len = *(reinterpret_cast<const u8 *>(SHERMAN_LEN_OFFSET + state_base));
+            const u8 *chars = reinterpret_cast<const u8 *>(state_base) + SHERMAN_CHARS_OFFSET;
            const u16 *states
-                = (const u16 *)(state_base + SHERMAN_STATES_OFFSET(len));
+                = reinterpret_cast<const u16 *>(state_base + SHERMAN_STATES_OFFSET(len));

            for (u8 i = 0; i < len; i++) {
                for (u16 c = 0; c < N_CHARS; c++) {
                    if (m->remap[c] != chars[i]) {
-                        t[c] = unaligned_load_u16((const u8*)&states[i])
+                        t[c] = unaligned_load_u16(reinterpret_cast<const u8*>(&states[i]))
                             & STATE_MASK;
                    }
                }
@ -116,14 +116,14 @@ void describeNode(const NFA *n, const mcclellan *m, u16 i, FILE *f) {

    bool isSherman = m->sherman_limit && i >= m->sherman_limit;
    const char *sher_base
-        = (const char *)m - sizeof(NFA) + m->sherman_offset;
+        = reinterpret_cast<const char *>(m) - sizeof(NFA) + m->sherman_offset;

    fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
            "label = \"%u%s\" ]; \n", i, i, isSherman ? "w":"");

    if (aux->accel_offset) {
        dumpAccelDot(f, i,
-          &((const gough_accel *)((const char *)m + aux->accel_offset))->accel);
+          &(reinterpret_cast<const gough_accel *>(reinterpret_cast<const char *>(m) + aux->accel_offset))->accel);
    }

    if (aux->accept_eod) {
@ -151,7 +151,7 @@ void describeNode(const NFA *n, const mcclellan *m, u16 i, FILE *f) {
        const char *sherman_state
            = findShermanState(m, sher_base, m->sherman_limit, i);
        fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i);
-        u16 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET);
+        u16 daddy = *(reinterpret_cast<const u16 *>(sherman_state + SHERMAN_DADDY_OFFSET));
        if (daddy) {
            fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n",
                    i, daddy);
@ -197,7 +197,7 @@ void dump_programs(FILE *f, const NFA *nfa,
    for (set<pair<pair<u32, u32>, u32 > >::const_iterator it
             = prog_dump.begin(); it != prog_dump.end(); ++it) {
        assert(it->second);
-        const gough_ins *p = (const gough_ins *)((const u8 *)nfa + it->second);
+        const gough_ins *p = reinterpret_cast<const gough_ins *>(reinterpret_cast<const u8 *>(nfa) + it->second);
        dump_program(f, it->first, p);
    }
 }
@ -205,17 +205,17 @@ void dump_programs(FILE *f, const NFA *nfa,
 static
 void dumpTransitions(const NFA *nfa, FILE *f,
                     set<pair<pair<u32, u32>, u32 > > *prog_dump) {
-    const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
    const gough_info *g = get_gough(m);
    u32 alphaSize = 1U << m->alphaShift;
-    const u32 *prog_offset_table = (const u32 *)(g + 1);
+    const u32 *prog_offset_table = reinterpret_cast<const u32 *>(g + 1);

    for (u16 i = 0; i < m->state_count; i++) {
        fprintf(f, "%05hu", i);
        const mstate_aux *aux = getAux(nfa, i);

        if (aux->accel_offset) {
-            dumpAccelText(f, (const union AccelAux *)((const char *)m +
+            dumpAccelText(f, reinterpret_cast<const union AccelAux *>(reinterpret_cast<const char *>(m) +
                                                      aux->accel_offset));
        }

@ -263,7 +263,7 @@ void dumpTransitions(const NFA *nfa, FILE *f,
 static
 void nfaExecGough8_dumpDot(const struct NFA *nfa, FILE *f) {
    assert(nfa->type == GOUGH_NFA_8);
-    const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));

    dumpDotPreambleDfa(f);

@ -284,7 +284,7 @@ static
 void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) {

    assert(nfa->type == GOUGH_NFA_8);
-    const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));

    fprintf(f, "gough 8\n");
    fprintf(f, "report: %u, states %u, length %u\n", m->arb_report,
@ -308,7 +308,7 @@ void nfaExecGough8_dumpText(const struct NFA *nfa, FILE *f) {
 static
 void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) {
    assert(nfa->type == GOUGH_NFA_16);
-    const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));

    dumpDotPreambleDfa(f);

@ -328,7 +328,7 @@ void nfaExecGough16_dumpDot(const struct NFA *nfa, FILE *f) {
 static
 void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
    assert(nfa->type == GOUGH_NFA_16);
-    const mcclellan *m = (const mcclellan *)getImplNfa(nfa);
+    const mcclellan *m = reinterpret_cast<const mcclellan *>(getImplNfa(nfa));
    //    const gough_info *h = get_gough(m);

    fprintf(f, "gough 16\n");
@ -336,7 +336,7 @@ void nfaExecGough16_dumpText(const struct NFA *nfa, FILE *f) {
            m->state_count, m->length);
    fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored,
            m->start_floating);
-    fprintf(f, "single accept: %d\n", !!(int)m->flags & MCCLELLAN_FLAG_SINGLE);
+    fprintf(f, "single accept: %d\n", !!(m->flags & MCCLELLAN_FLAG_SINGLE));
    fprintf(f, "sherman_limit: %u, sherman_end: %u\n", m->sherman_limit,
            m->sherman_end);

--- a/src/nfa/lbr.c
+++ b/src/nfa/lbr.c
@ -307,7 +307,7 @@ char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end,
 static really_inline
 char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
                   UNUSED size_t begin, UNUSED size_t end,
-                   UNUSED size_t *loc) {
+                   UNUSED const size_t *loc) {
    assert(begin <= end);
    assert(nfa->type == LBR_NFA_DOT);
    // Nothing can kill a dot!
@ -413,7 +413,7 @@ char lbrRevScanTruf(const struct NFA *nfa, const u8 *buf,
 static really_inline
 char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf,
                   UNUSED size_t begin, UNUSED size_t end,
-                   UNUSED size_t *loc) {
+                   UNUSED const size_t *loc) {
    assert(begin <= end);
    assert(nfa->type == LBR_NFA_DOT);
    // Nothing can kill a dot!
--- a/src/nfa/lbr_common_impl.h
+++ b/src/nfa/lbr_common_impl.h
@ -180,7 +180,7 @@ found_top:;

        u64a ep = MIN(MIN(end, (s64a)q->length) + offset, first_match);
        if (ep > sp && sp >= offset) {
-            size_t eloc;
+            size_t eloc = 0;
            DEBUG_PRINTF("rev b%llu e%llu/%zu\n", sp - offset, ep - offset,
                         q->length);
            assert(ep - offset <= q->length);
@ -279,6 +279,7 @@ char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q,
                assert(rv == MO_CONTINUE_MATCHING);
            }

+            // cppcheck-suppress knownConditionTrueFalse
            if (escape_found) {
                DEBUG_PRINTF("clearing repeat due to escape\n");
                clearRepeat(info, lstate);
@ -355,6 +356,7 @@ void JOIN(ENGINE_EXEC_NAME, _StreamSilent)(const struct NFA *nfa, struct mq *q,

    size_t eloc = 0;
    char escaped = FWDSCAN_FN(nfa, buf, 0, length, &eloc);
+    // cppcheck-suppress knownConditionTrueFalse
    if (escaped) {
        assert(eloc < length);
        DEBUG_PRINTF("escape found at %zu, clearing repeat\n", eloc);
--- a/src/nfa/lbr_dump.cpp
+++ b/src/nfa/lbr_dump.cpp
@ -56,7 +56,7 @@ namespace ue2 {
 static
 void lbrDumpCommon(const lbr_common *lc, FILE *f) {
    const RepeatInfo *info
-        = (const RepeatInfo *)((const char *)lc + lc->repeatInfoOffset);
+        = reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(lc) + lc->repeatInfoOffset);
    fprintf(f, "Limited Bounded Repeat\n");
    fprintf(f, "\n");
    fprintf(f, "repeat model:  %s\n", repeatTypeName(info->type));
@ -70,7 +70,7 @@ void lbrDumpCommon(const lbr_common *lc, FILE *f) {
 void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
    assert(nfa);
    assert(nfa->type == LBR_NFA_DOT);
-    const lbr_dot *ld = (const lbr_dot *)getImplNfa(nfa);
+    const lbr_dot *ld = reinterpret_cast<const lbr_dot *>(getImplNfa(nfa));
    StdioFile f(base + ".txt", "w");
    lbrDumpCommon(&ld->common, f);
    fprintf(f, "DOT model\n");
@ -81,7 +81,7 @@ void nfaExecLbrDot_dump(const NFA *nfa, const string &base) {
 void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
    assert(nfa);
    assert(nfa->type == LBR_NFA_VERM);
-    const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
+    const lbr_verm *lv = reinterpret_cast<const lbr_verm *>(getImplNfa(nfa));
    StdioFile f(base + ".txt", "w");
    lbrDumpCommon(&lv->common, f);
    fprintf(f, "VERM model, scanning for 0x%02x\n", lv->c);
@ -92,7 +92,7 @@ void nfaExecLbrVerm_dump(const NFA *nfa, const string &base) {
 void nfaExecLbrNVerm_dump(const NFA *nfa, const string &base) {
    assert(nfa);
    assert(nfa->type == LBR_NFA_NVERM);
-    const lbr_verm *lv = (const lbr_verm *)getImplNfa(nfa);
+    const lbr_verm *lv = reinterpret_cast<const lbr_verm *>(getImplNfa(nfa));
    StdioFile f(base + ".txt", "w");
    lbrDumpCommon(&lv->common, f);
    fprintf(f, "NEGATED VERM model, scanning for 0x%02x\n", lv->c);
@ -106,11 +106,11 @@ void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) {

    StdioFile f(base + ".txt", "w");

-    const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa);
+    const lbr_shuf *ls = reinterpret_cast<const lbr_shuf *>(getImplNfa(nfa));
    lbrDumpCommon(&ls->common, f);

-    CharReach cr = shufti2cr((const u8 *)&ls->mask_lo,
-                             (const u8 *)&ls->mask_hi);
+    CharReach cr = shufti2cr(reinterpret_cast<const u8 *>(&ls->mask_lo),
+                             reinterpret_cast<const u8 *>(&ls->mask_hi));
    fprintf(f, "SHUF model, scanning for: %s (%zu chars)\n",
            describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
    fprintf(f, "\n");
@ -123,11 +123,11 @@ void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) {

    StdioFile f(base + ".txt", "w");

-    const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa);
+    const lbr_truf *lt = reinterpret_cast<const lbr_truf *>(getImplNfa(nfa));
    lbrDumpCommon(&lt->common, f);

-    CharReach cr = truffle2cr((const u8 *)&lt->mask1,
-                              (const u8 *)&lt->mask2);
+    CharReach cr = truffle2cr(reinterpret_cast<const u8 *>(&lt->mask1),
+                              reinterpret_cast<const u8 *>(&lt->mask2));
    fprintf(f, "TRUFFLE model, scanning for: %s (%zu chars)\n",
            describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count());
    fprintf(f, "\n");
--- a/src/nfa/limex.h
+++ b/src/nfa/limex.h
@ -56,7 +56,7 @@ extern "C"
    char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end);             \
    char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end);            \
    char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report);     \
-    char gf_name##_reportCurrent(const struct NFA *n, struct mq *q);           \
+    char gf_name##_reportCurrent(const struct NFA *n, const struct mq *q);     \
    char gf_name##_inAccept(const struct NFA *n, ReportID report,              \
                            struct mq *q);                                     \
    char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q);             \
--- a/src/nfa/limex_common_impl.h
+++ b/src/nfa/limex_common_impl.h
@ -125,6 +125,7 @@ char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s,
    const STATE_T accept_mask = *acceptMask;
    STATE_T accepts = AND_STATE(*s, accept_mask);

+    DEBUG_PRINTF("sizeof(STATE_T): %zu, sizeof(CHUNK_T): %zu, NUM_STATE_CHUNKS: %zu\n", sizeof(STATE_T), sizeof(CHUNK_T), NUM_STATE_CHUNKS);
    // Caller must ensure that we have at least one accept state on.
    assert(ISNONZERO_STATE(accepts));

@ -135,6 +136,7 @@ char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s,
    memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));

    u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
+    // cppcheck-suppress unsignedLessThanZero
    for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
        CHUNK_T chunk = chunks[i];
        while (chunk != 0) {
@ -332,7 +334,7 @@ void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx,
 // UE-1636) need to guard cyclic tug-accepts as well.
 static really_inline
 char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
-                       union RepeatControl *repeat_ctrl, char *repeat_state,
+                       const union RepeatControl *repeat_ctrl, const char *repeat_state,
                       u64a offset, ReportID report) {
    assert(limex);

@ -358,6 +360,7 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
    memcpy(mask_chunks, &accept_mask, sizeof(accept_mask));

    u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk.
+    // cppcheck-suppress unsignedLessThanZero
    for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
        CHUNK_T chunk = chunks[i];
        while (chunk != 0) {
@ -382,7 +385,7 @@ char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,

 static really_inline
 char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state,
-                          union RepeatControl *repeat_ctrl, char *repeat_state,
+                          const union RepeatControl *repeat_ctrl, const char *repeat_state,
                          u64a offset) {
    assert(limex);

--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@ -140,6 +140,7 @@ reindexByStateId(const unordered_map<NFAVertex, NFAStateSet> &in,
        for (size_t i = m.second.find_first(); i != m.second.npos;
             i = m.second.find_next(i)) {
            u32 state_id = indexToState[i];
+            // cppcheck-suppress knownConditionTrueFalse
            if (state_id == NO_STATE) {
                continue;
            }
@ -269,7 +270,7 @@ void maskClear(Mask &m) {
 template<class Mask>
 u8 *maskGetByte(Mask &m, u32 bit) {
    assert(bit < sizeof(m)*8);
-    u8 *m8 = (u8 *)&m;
+    u8 *m8 = reinterpret_cast<u8 *>(&m);

    return m8 + bit/8;
 }
@ -290,7 +291,7 @@ void maskSetBits(Mask &m, const NFAStateSet &bits) {

 template<class Mask>
 bool isMaskZero(Mask &m) {
-    u8 *m8 = (u8 *)&m;
+    const u8 *m8 = reinterpret_cast<u8 *>(&m);
    for (u32 i = 0; i < sizeof(m); i++) {
        if (m8[i]) {
            return false;
@ -303,7 +304,7 @@ bool isMaskZero(Mask &m) {
 template<class Mask>
 void maskSetByte(Mask &m, const unsigned int idx, const char val) {
    assert(idx < sizeof(m));
-    char *m8 = (char *)&m;
+    char *m8 = reinterpret_cast<char *>(&m);
    char &byte = m8[idx];
    byte = val;
 }
@ -329,11 +330,12 @@ void buildReachMapping(const build_info &args, vector<NFAStateSet> &reach,
    // Build a list of vertices with a state index assigned.
    vector<NFAVertex> verts;
    verts.reserve(args.num_states);
-    for (auto v : vertices_range(h)) {
-        if (state_ids.at(v) != NO_STATE) {
-            verts.emplace_back(v);
-        }
-    }
+    auto sidat = [&state_ids=state_ids](const NFAVertex &v) {
+        // cppcheck-suppress knownConditionTrueFalse
+        return (state_ids.at(v) != NO_STATE);
+    };
+    const auto &vr = vertices_range(h);
+    std::copy_if(begin(vr), end(vr),  std::back_inserter(verts), sidat);

    // Build a mapping from set-of-states -> reachability.
    map<NFAStateSet, CharReach> mapping;
@ -482,6 +484,7 @@ bool allow_wide_accel(NFAVertex v, const NGHolder &g, NFAVertex sds_or_proxy) {
 static
 bool allow_wide_accel(const vector<NFAVertex> &vv, const NGHolder &g,
                      NFAVertex sds_or_proxy) {
+    // cppcheck-suppress useStlAlgorithm
    for (auto v : vv) {
        if (allow_wide_accel(v, g, sds_or_proxy)) {
            return true;
@ -555,7 +558,8 @@ void filterAccelStates(NGHolder &g, const map<u32, set<NFAVertex>> &tops,

    // Similarly, connect (start, startDs) if necessary.
    if (!edge(g.start, g.startDs, g).second) {
-        NFAEdge e = add_edge(g.start, g.startDs, g);
+        NFAEdge e;
+        std::tie(e, std::ignore) = add_edge(g.start, g.startDs, g);
        tempEdges.emplace_back(e); // Remove edge later.
    }

@ -584,6 +588,7 @@ bool containsBadSubset(const limex_accel_info &accel,
        subset = state_set;
        subset.reset(j);

+        // cppcheck-suppress knownConditionTrueFalse
        if (effective_sds != NO_STATE && subset.count() == 1 &&
            subset.test(effective_sds)) {
            continue;
@ -623,7 +628,8 @@ void fillAccelInfo(build_info &bi) {

    vector<NFAVertex> astates;
    for (const auto &m : accel_map) {
-        astates.emplace_back(m.first);
+        // cppcheck-suppress useStlAlgorithm
+        astates.emplace_back(m.first);  // NOLINT(performance-inefficient-vector-operation)
    }

    NFAStateSet useful(num_states);
@ -799,12 +805,14 @@ u32 getEffectiveAccelStates(const build_info &args,
            continue;
        }
        for (const auto &s_mask : args.squashMap | map_values) {
+            // cppcheck-suppress useStlAlgorithm
            if (!s_mask.test(state_id)) {
                may_turn_off |= 1U << accel_id;
                break;
            }
        }
        for (const auto &s_mask : args.reportSquashMap | map_values) {
+            // cppcheck-suppress useStlAlgorithm
            if (!s_mask.test(state_id)) {
                may_turn_off |= 1U << accel_id;
                break;
@ -914,11 +922,13 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask,

    // Start with the NONE case.
    auxvec.emplace_back(AccelAux());
+    // cppcheck-suppress memsetClassFloat
    memset(&auxvec[0], 0, sizeof(AccelAux));
    auxvec[0].accel_type = ACCEL_NONE; // no states on.

    AccelAux aux;
    for (u32 i = 1; i < accelCount; i++) {
+        // cppcheck-suppress memsetClassFloat
        memset(&aux, 0, sizeof(aux));

        NFAStateSet effective_states(args.num_states);
@ -1064,7 +1074,7 @@ void buildAcceptsList(const build_info &args, ReportListCache &reports_cache,
            a.reports = addReports(h[v].reports, reports, reports_cache);
        }
        a.squash = addSquashMask(args, v, squash);
-        accepts.emplace_back(std::move(a));
+        accepts.emplace_back(a);
    }
 }

@ -1083,6 +1093,7 @@ void buildAccepts(const build_info &args, ReportListCache &reports_cache,
    for (auto v : vertices_range(h)) {
        u32 state_id = args.state_ids.at(v);

+        // cppcheck-suppress knownConditionTrueFalse
        if (state_id == NO_STATE || !is_match_vertex(v, h)) {
            continue;
        }
@ -1142,6 +1153,7 @@ u32 compressedStateSize(const NGHolder &h, const NFAStateSet &maskedStates,

    for (auto v : vertices_range(h)) {
        u32 i = state_ids.at(v);
+        // cppcheck-suppress knownConditionTrueFalse
        if (i == NO_STATE || maskedStates.test(i)) {
            continue;
        }
@ -1167,6 +1179,7 @@ bool hasSquashableInitDs(const build_info &args) {

    NFAStateSet initDs(args.num_states);
    u32 sds_state = args.state_ids.at(h.startDs);
+    // cppcheck-suppress knownConditionTrueFalse
    if (sds_state == NO_STATE) {
        DEBUG_PRINTF("no states in initds\n");
        return false;
@ -1208,10 +1221,11 @@ bool hasSquashableInitDs(const build_info &args) {
 static
 bool hasInitDsStates(const NGHolder &h,
                     const unordered_map<NFAVertex, u32> &state_ids) {
+    // cppcheck-suppress knownConditionTrueFalse
    if (state_ids.at(h.startDs) != NO_STATE) {
        return true;
    }
-
+    // cppcheck-suppress knownConditionTrueFalse
    if (is_triggered(h) && state_ids.at(h.start) != NO_STATE) {
        return true;
    }
@ -1227,6 +1241,7 @@ void findMaskedCompressionStates(const build_info &args,
        // Rose leftfixes can mask out initds, which is worth doing if it will
        // stay on forever (i.e. it's not squashable).
        u32 sds_i = args.state_ids.at(h.startDs);
+        // cppcheck-suppress knownConditionTrueFalse
        if (sds_i != NO_STATE && !hasSquashableInitDs(args)) {
            maskedStates.set(sds_i);
            DEBUG_PRINTF("masking out initds state\n");
@ -1242,6 +1257,7 @@ void findMaskedCompressionStates(const build_info &args,
        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
+            // cppcheck-suppress knownConditionTrueFalse
            if (from == NO_STATE) {
                continue;
            }
@ -1249,6 +1265,7 @@ void findMaskedCompressionStates(const build_info &args,
            // We cannot mask out EOD accepts, as they have to perform an
            // action after they're switched on that may be delayed until the
            // next stream write.
+            // cppcheck-suppress knownConditionTrueFalse
            if (to == NO_STATE && target(e, h) != h.acceptEod) {
                continue;
            }
@ -1399,6 +1416,7 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
    for (auto v : vertices_range(h)) {
        const u32 i = args.state_ids.at(v);

+        // cppcheck-suppress knownConditionTrueFalse
        if (i == NO_STATE) {
            continue;
        }
@ -1481,6 +1499,8 @@ u32 buildExceptionMap(const build_info &args, ReportListCache &reports_cache,
                    continue;
                }
                u32 j = args.state_ids.at(w);
+                // state_ids maps vertices with no assigned state index to NO_STATE, so j may equal it
+                // cppcheck-suppress knownConditionTrueFalse
                if (j == NO_STATE) {
                    continue;
                }
@ -1553,6 +1573,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
    for (const auto &e : edges_range(h)) {
        u32 from = args.state_ids.at(source(e, h));
        u32 to = args.state_ids.at(target(e, h));
+        // cppcheck-suppress knownConditionTrueFalse
        if (from == NO_STATE || to == NO_STATE) {
            continue;
        }
@ -1572,7 +1593,7 @@ u32 findMaxVarShift(const build_info &args, u32 nShifts) {
 static
 int getLimexScore(const build_info &args, u32 nShifts) {
    const NGHolder &h = args.h;
-    u32 maxVarShift = nShifts;
+    u32 maxVarShift;
    int score = 0;

    score += SHIFT_COST * nShifts;
@ -1582,6 +1603,7 @@ int getLimexScore(const build_info &args, u32 nShifts) {
    for (const auto &e : edges_range(h)) {
        u32 from = args.state_ids.at(source(e, h));
        u32 to = args.state_ids.at(target(e, h));
+        // cppcheck-suppress knownConditionTrueFalse
        if (from == NO_STATE || to == NO_STATE) {
            continue;
        }
@ -1678,7 +1700,7 @@ static
 bool cannotDie(const build_info &args) {
    const auto &h = args.h;
    const auto &state_ids = args.state_ids;
-
+    // cppcheck-suppress knownConditionTrueFalse
    // If we have a startDs we're actually using, we can't die.
    if (state_ids.at(h.startDs) != NO_STATE) {
        DEBUG_PRINTF("is using startDs\n");
@ -1700,7 +1722,7 @@ struct Factory {
    static
    void allocState(NFA *nfa, u32 repeatscratchStateSize,
                    u32 repeatStreamState) {
-        implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa);
+        const implNFA_t *limex = reinterpret_cast<implNFA_t *>(getMutableImplNfa(nfa));

        // LimEx NFAs now store the following in state:
        // 1. state bitvector (always present)
@ -1766,7 +1788,7 @@ struct Factory {
            u32 tableOffset, tugMaskOffset;
            size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset);
            auto info = make_zeroed_bytecode_ptr<NFARepeatInfo>(len);
-            char *info_ptr = (char *)info.get();
+            char *info_ptr = reinterpret_cast<char *>(info.get());

            // Collect state space info.
            RepeatStateInfo rsi(br.type, br.repeatMin, br.repeatMax, br.minPeriod);
@ -1781,8 +1803,7 @@ struct Factory {
            info->tugMaskOffset = tugMaskOffset;

            // Fill the RepeatInfo structure.
-            RepeatInfo *repeat =
-                (RepeatInfo *)(info_ptr + sizeof(NFARepeatInfo));
+            RepeatInfo *repeat = reinterpret_cast<RepeatInfo *>(info_ptr + sizeof(NFARepeatInfo));
            repeat->type = br.type;
            repeat->repeatMin = depth_to_u32(br.repeatMin);
            repeat->repeatMax = depth_to_u32(br.repeatMax);
@ -1808,7 +1829,7 @@ struct Factory {
            }

            // Fill the tug mask.
-            tableRow_t *tugMask = (tableRow_t *)(info_ptr + tugMaskOffset);
+            tableRow_t *tugMask = reinterpret_cast<tableRow_t *>(info_ptr + tugMaskOffset);
            for (auto v : br.tug_triggers) {
                u32 state_id = args.state_ids.at(v);
                assert(state_id != NO_STATE);
@ -1831,6 +1852,7 @@ struct Factory {
        u32 s_i = args.state_ids.at(h.start);
        u32 sds_i = args.state_ids.at(h.startDs);

+        // cppcheck-suppress knownConditionTrueFalse
        if (s_i != NO_STATE) {
            maskSetBit(limex->init, s_i);
            if (is_triggered(h)) {
@ -1838,6 +1860,7 @@ struct Factory {
            }
        }

+        // cppcheck-suppress knownConditionTrueFalse
        if (sds_i != NO_STATE) {
            maskSetBit(limex->init, sds_i);
            maskSetBit(limex->initDS, sds_i);
@ -1873,6 +1896,7 @@ struct Factory {
        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
+            // cppcheck-suppress knownConditionTrueFalse
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }
@ -1911,6 +1935,7 @@ struct Factory {
        for (const auto &e : edges_range(h)) {
            u32 from = args.state_ids.at(source(e, h));
            u32 to = args.state_ids.at(target(e, h));
+            // cppcheck-suppress knownConditionTrueFalse
            if (from == NO_STATE || to == NO_STATE) {
                continue;
            }
@ -1929,7 +1954,7 @@ struct Factory {
                         const u32 reportListOffset) {
        DEBUG_PRINTF("exceptionsOffset=%u\n", exceptionsOffset);

-        exception_t *etable = (exception_t *)((char *)limex + exceptionsOffset);
+        exception_t *etable = reinterpret_cast<exception_t *>(reinterpret_cast<char *>(limex) + exceptionsOffset);
        assert(ISALIGNED(etable));

        map<u32, ExceptionProto> exception_by_state;
@ -1977,10 +2002,10 @@ struct Factory {
        limex->exceptionCount = ecount;

        if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
-            const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
-            u8 *shufMask = (u8 *)&limex->exceptionShufMask;
-            u8 *bitMask = (u8 *)&limex->exceptionBitMask;
-            u8 *andMask = (u8 *)&limex->exceptionAndMask;
+            const u8 *exceptionMask = reinterpret_cast<const u8 *>(&limex->exceptionMask);
+            u8 *shufMask = reinterpret_cast<u8 *>(&limex->exceptionShufMask);
+            u8 *bitMask = reinterpret_cast<u8 *>(&limex->exceptionBitMask);
+            u8 *andMask = reinterpret_cast<u8 *>(&limex->exceptionAndMask);

            u32 tot_cnt = 0;
            u32 pos = 0;
@ -2040,7 +2065,7 @@ struct Factory {
        copy(reachMap.begin(), reachMap.end(), &limex->reachMap[0]);

        // Reach table is right after the LimEx structure.
-        tableRow_t *reachMask = (tableRow_t *)((char *)limex + reachOffset);
+        tableRow_t *reachMask = reinterpret_cast<tableRow_t *>(reinterpret_cast<char *>(limex) + reachOffset);
        assert(ISALIGNED(reachMask));
        for (size_t i = 0, end = reach.size(); i < end; i++) {
            maskSetBits(reachMask[i], reach[i]);
@ -2054,7 +2079,7 @@ struct Factory {
        DEBUG_PRINTF("topsOffset=%u\n", topsOffset);

        limex->topOffset = topsOffset;
-        tableRow_t *topMasks = (tableRow_t *)((char *)limex + topsOffset);
+        tableRow_t *topMasks = reinterpret_cast<tableRow_t *>(reinterpret_cast<char *>(limex) + topsOffset);
        assert(ISALIGNED(topMasks));

        for (size_t i = 0, end = tops.size(); i < end; i++) {
@ -2066,8 +2091,8 @@ struct Factory {

    static
    void writeAccelSsse3Masks(const NFAStateSet &accelMask, implNFA_t *limex) {
-        char *perm_base = (char *)&limex->accelPermute;
-        char *comp_base = (char *)&limex->accelCompare;
+        char *perm_base = reinterpret_cast<char *>(&limex->accelPermute);
+        char *comp_base = reinterpret_cast<char *>(&limex->accelCompare);

        u32 num = 0; // index in accel table.
        for (size_t i = accelMask.find_first(); i != accelMask.npos;
@ -2078,8 +2103,8 @@ struct Factory {
            // PSHUFB permute and compare masks
            size_t mask_idx = sizeof(u_128) * (state_id / 128U);
            DEBUG_PRINTF("mask_idx=%zu\n", mask_idx);
-            u_128 *perm = (u_128 *)(perm_base + mask_idx);
-            u_128 *comp = (u_128 *)(comp_base + mask_idx);
+            u_128 *perm = reinterpret_cast<u_128 *>(perm_base + mask_idx);
+            u_128 *comp = reinterpret_cast<u_128 *>(comp_base + mask_idx);
            maskSetByte(*perm, num, ((state_id % 128U) / 8U));
            maskSetByte(*comp, num, ~(1U << (state_id % 8U)));
        }
@ -2097,11 +2122,11 @@ struct Factory {
        // Write accel lookup table.
        limex->accelTableOffset = accelTableOffset;
        copy(accelTable.begin(), accelTable.end(),
-             (u8 *)((char *)limex + accelTableOffset));
+             reinterpret_cast<u8 *>(reinterpret_cast<char *>(limex) + accelTableOffset));

        // Write accel aux structures.
        limex->accelAuxOffset = accelAuxOffset;
-        AccelAux *auxTable = (AccelAux *)((char *)limex + accelAuxOffset);
+        AccelAux *auxTable = reinterpret_cast<AccelAux *>(reinterpret_cast<char *>(limex) + accelAuxOffset);
        assert(ISALIGNED(auxTable));
        copy(accelAux.begin(), accelAux.end(), auxTable);

@ -2131,7 +2156,7 @@ struct Factory {
                      const vector<NFAStateSet> &squash, implNFA_t *limex,
                      const u32 acceptsOffset, const u32 acceptsEodOffset,
                      const u32 squashOffset, const u32 reportListOffset) {
-        char *limex_base = (char *)limex;
+        char *limex_base = reinterpret_cast<char *>(limex);

        DEBUG_PRINTF("acceptsOffset=%u, acceptsEodOffset=%u, squashOffset=%u\n",
                     acceptsOffset, acceptsEodOffset, squashOffset);
@ -2154,7 +2179,7 @@ struct Factory {
        limex->acceptOffset = acceptsOffset;
        limex->acceptCount = verify_u32(accepts.size());
        DEBUG_PRINTF("NFA has %zu accepts\n", accepts.size());
-        NFAAccept *acceptsTable = (NFAAccept *)(limex_base + acceptsOffset);
+        NFAAccept *acceptsTable = reinterpret_cast<NFAAccept *>(limex_base + acceptsOffset);
        assert(ISALIGNED(acceptsTable));
        transform(accepts.begin(), accepts.end(), acceptsTable,
                  transform_offset_fn);
@ -2163,7 +2188,7 @@ struct Factory {
        limex->acceptEodOffset = acceptsEodOffset;
        limex->acceptEodCount = verify_u32(acceptsEod.size());
        DEBUG_PRINTF("NFA has %zu EOD accepts\n", acceptsEod.size());
-        NFAAccept *acceptsEodTable = (NFAAccept *)(limex_base + acceptsEodOffset);
+        NFAAccept *acceptsEodTable = reinterpret_cast<NFAAccept *>(limex_base + acceptsEodOffset);
        assert(ISALIGNED(acceptsEodTable));
        transform(acceptsEod.begin(), acceptsEod.end(), acceptsEodTable,
                  transform_offset_fn);
@ -2172,7 +2197,7 @@ struct Factory {
        limex->squashCount = verify_u32(squash.size());
        limex->squashOffset = squashOffset;
        DEBUG_PRINTF("NFA has %zu report squash masks\n", squash.size());
-        tableRow_t *mask = (tableRow_t *)(limex_base + squashOffset);
+        tableRow_t *mask = reinterpret_cast<tableRow_t *>(limex_base + squashOffset);
        assert(ISALIGNED(mask));
        for (size_t i = 0, end = squash.size(); i < end; i++) {
            maskSetBits(mask[i], squash[i]);
@ -2194,13 +2219,13 @@ struct Factory {
        for (u32 i = 0; i < num_repeats; i++) {
            repeatOffsets[i] = offset;
            assert(repeats[i]);
-            memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size());
+            memcpy(reinterpret_cast<char *>(limex) + offset, repeats[i].get(), repeats[i].size());
            offset += repeats[i].size();
        }

        // Write repeat offset lookup table.
-        assert(ISALIGNED_N((char *)limex + repeatOffsetsOffset, alignof(u32)));
-        copy_bytes((char *)limex + repeatOffsetsOffset, repeatOffsets);
+        assert(ISALIGNED_N(reinterpret_cast<char *>(limex) + repeatOffsetsOffset, alignof(u32)));
+        copy_bytes(reinterpret_cast<char *>(limex) + repeatOffsetsOffset, repeatOffsets);

        limex->repeatOffset = repeatOffsetsOffset;
        limex->repeatCount = num_repeats;
@ -2210,15 +2235,15 @@ struct Factory {
    void writeReportList(const vector<ReportID> &reports, implNFA_t *limex,
                         const u32 reportListOffset) {
        DEBUG_PRINTF("reportListOffset=%u\n", reportListOffset);
-        assert(ISALIGNED_N((char *)limex + reportListOffset,
+        assert(ISALIGNED_N(reinterpret_cast<char *>(limex) + reportListOffset,
                           alignof(ReportID)));
-        copy_bytes((char *)limex + reportListOffset, reports);
+        copy_bytes(reinterpret_cast<char *>(limex) + reportListOffset, reports);
    }

    static
    bytecode_ptr<NFA> generateNfa(const build_info &args) {
        if (args.num_states > NFATraits<dtype>::maxStates) {
-            return nullptr;
+            return bytecode_ptr<NFA>(nullptr);
        }

        // Build bounded repeat structures.
@ -2321,7 +2346,7 @@ struct Factory {
        auto nfa = make_zeroed_bytecode_ptr<NFA>(nfaSize);
        assert(nfa); // otherwise we would have thrown std::bad_alloc

-        implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get());
+        implNFA_t *limex = reinterpret_cast<implNFA_t *>(getMutableImplNfa(nfa.get()));
        assert(ISALIGNED(limex));

        writeReachMapping(reach, reachMap, limex, reachOffset);
@ -2453,6 +2478,7 @@ bool isSane(const NGHolder &h, const map<u32, set<NFAVertex>> &tops,
            return false;
        }
        const u32 i = state_ids.at(v);
+        // cppcheck-suppress knownConditionTrueFalse
        if (i == NO_STATE) {
            continue;
        }
@ -2533,6 +2559,7 @@ bool isFast(const build_info &args) {
                    continue;
                }
                u32 j = args.state_ids.at(w);
+                // cppcheck-suppress knownConditionTrueFalse
                if (j == NO_STATE) {
                    continue;
                }
@ -2577,7 +2604,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,

    if (!cc.grey.allowLimExNFA) {
        DEBUG_PRINTF("limex not allowed\n");
-        return nullptr;
+        return bytecode_ptr<NFA>(nullptr);
    }

    // If you ask for a particular type, it had better be an NFA.
@ -2612,7 +2639,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,

    if (scores.empty()) {
        DEBUG_PRINTF("No NFA returned a valid score for this case.\n");
-        return nullptr;
+        return bytecode_ptr<NFA>(nullptr);
    }

    // Sort acceptable models in priority order, lowest score first.
@ -2631,7 +2658,7 @@ bytecode_ptr<NFA> generate(NGHolder &h,
    }

    DEBUG_PRINTF("NFA build failed.\n");
-    return nullptr;
+    return bytecode_ptr<NFA>(nullptr);
 }

 u32 countAccelStates(NGHolder &h,
--- a/src/nfa/limex_dump.cpp
+++ b/src/nfa/limex_dump.cpp
@ -108,14 +108,14 @@ void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) {
    fprintf(f, "\n");
    fprintf(f, "%u bounded repeats.\n", limex->repeatCount);

-    const char *base = (const char *)limex;
-    const u32 *repeatOffset = (const u32 *)(base + limex->repeatOffset);
+    const char *base = reinterpret_cast<const char *>(limex);
+    const u32 *repeatOffset = reinterpret_cast<const u32 *>(base + limex->repeatOffset);

    for (u32 i = 0; i < limex->repeatCount; i++) {
        const NFARepeatInfo *info =
-            (const NFARepeatInfo *)(base + repeatOffset[i]);
+            reinterpret_cast<const NFARepeatInfo *>(base + repeatOffset[i]);
        const RepeatInfo *repeat =
-            (const RepeatInfo *)((const char *)info + sizeof(*info));
+            reinterpret_cast<const RepeatInfo *>(reinterpret_cast<const char *>(info) + sizeof(*info));
        fprintf(f, "  repeat %u: %s {%u,%u} packedCtrlSize=%u, "
                   "stateSize=%u\n",
                i, repeatTypeName(repeat->type), repeat->repeatMin,
@ -123,7 +123,7 @@ void dumpRepeats(const limex_type *limex, u32 model_size, FILE *f) {
        fprintf(f, "    nfa state: stream offset %u\n", info->stateOffset);
        fprintf(f, "    ");

-        const u8 *tug_mask = (const u8 *)info + info->tugMaskOffset;
+        const u8 *tug_mask = reinterpret_cast<const u8 *>(info) + info->tugMaskOffset;
        dumpMask(f, "tugs", tug_mask, model_size);
    }

@ -136,7 +136,7 @@ void dumpLimexReachMasks(u32 model_size, const u8 *reach, u32 reachCount,
    for (u32 i = 0; i < reachCount; i++) {
        char tmp_common[100];
        const u8 *row = reach + (i * (model_size/8));
-        sprintf(tmp_common, "reach mask %u ", i);
+        snprintf(tmp_common, sizeof(tmp_common), "reach mask %u ", i);
        dumpMask(f, tmp_common, row, model_size);
    }
 }
@ -157,7 +157,7 @@ void dumpLimexReachMap(const u8 *reachMap, FILE *f) {
 template<typename limex_type>
 static
 const NFA *limex_to_nfa(const limex_type *limex) {
-    return (const NFA *)((const char *)limex - sizeof(NFA));
+    return reinterpret_cast<const NFA *>(reinterpret_cast<const char *>(limex) - sizeof(NFA));
 }

 template<typename limex_type>
@ -172,8 +172,8 @@ void dumpAccel(const limex_type *limex, FILE *f) {

    u32 tableOffset = limex->accelTableOffset;
    u32 auxOffset = limex->accelAuxOffset;
-    const u8 *accelTable = (const u8 *)((const char *)limex + tableOffset);
-    const AccelAux *aux = (const AccelAux *)((const char *)limex + auxOffset);
+    const u8 *accelTable = reinterpret_cast<const u8 *>(reinterpret_cast<const char *>(limex) + tableOffset);
+    const AccelAux *aux = reinterpret_cast<const AccelAux *>(reinterpret_cast<const char *>(limex) + auxOffset);

    for (u32 i = 0; i < limex->accelCount; i++) {
        fprintf(f, "  accel %u (aux entry %u): ", i, accelTable[i]);
@ -191,7 +191,7 @@ void dumpAcceptList(const char *limex_base, const struct NFAAccept *accepts,
            continue;
        }
        fprintf(f, "  idx %u fires report list %u:", i, a.reports);
-        const ReportID *report = (const ReportID *)(limex_base + a.reports);
+        const ReportID *report = reinterpret_cast<const ReportID *>(limex_base + a.reports);
        for (; *report != MO_INVALID_IDX; report++) {
            fprintf(f, " %u", *report);
        }
@ -202,18 +202,18 @@ void dumpAcceptList(const char *limex_base, const struct NFAAccept *accepts,
 template<typename limex_type>
 static
 void dumpAccepts(const limex_type *limex, FILE *f) {
-    const char *limex_base = (const char *)limex;
+    const char *limex_base = reinterpret_cast<const char *>(limex);

    const u32 acceptCount = limex->acceptCount;
    const u32 acceptEodCount = limex->acceptEodCount;

    fprintf(f, "\n%u accepts.\n", acceptCount);
    const auto *accepts =
-        (const struct NFAAccept *)(limex_base + limex->acceptOffset);
+        reinterpret_cast<const struct NFAAccept *>(limex_base + limex->acceptOffset);
    dumpAcceptList(limex_base, accepts, acceptCount, f);
    fprintf(f, "\n%u accepts at EOD.\n", acceptEodCount);
    const auto *accepts_eod =
-        (const struct NFAAccept *)(limex_base + limex->acceptEodOffset);
+        reinterpret_cast<const struct NFAAccept *>(limex_base + limex->acceptEodOffset);
    dumpAcceptList(limex_base, accepts_eod, acceptEodCount, f);
    fprintf(f, "\n");
 }
@ -224,7 +224,7 @@ void dumpSquash(const limex_type *limex, FILE *f) {
    u32 size = limex_traits<limex_type>::size;

    // Dump squash masks, if there are any.
-    const u8 *squashMask = (const u8 *)limex + limex->squashOffset;
+    const u8 *squashMask = reinterpret_cast<const u8 *>(limex) + limex->squashOffset;
    for (u32 i = 0; i < limex->squashCount; i++) {
        std::ostringstream name;
        name << "squash_" << i;
@ -238,7 +238,7 @@ static
 const typename limex_traits<limex_type>::exception_type *
 getExceptionTable(const limex_type *limex) {
    return (const typename limex_traits<limex_type>::exception_type *)
-        ((const char *)limex + limex->exceptionOffset);
+        (reinterpret_cast<const char *>(limex) + limex->exceptionOffset);
 }

 template<typename limex_type>
@ -248,7 +248,7 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) {
                getExceptionTable(limex);
    const u32 size = limex_traits<limex_type>::size;

-    const char *limex_base = (const char *)limex;
+    const char *limex_base = reinterpret_cast<const char *>(limex);

    fprintf(f, "\n");
    for (u32 i = 0; i < limex->exceptionCount; i++) {
@ -259,13 +259,13 @@ void dumpLimexExceptions(const limex_type *limex, FILE *f) {
        case LIMEX_TRIGGER_POS: fprintf(f, "  trigger: POS\n"); break;
        default: break;
        }
-        dumpMask(f, "succ", (const u8 *)&e[i].successors, size);
-        dumpMask(f, "squash", (const u8 *)&e[i].squash, size);
+        dumpMask(f, "succ", reinterpret_cast<const u8 *>(&e[i].successors), size);
+        dumpMask(f, "squash", reinterpret_cast<const u8 *>(&e[i].squash), size);
        fprintf(f, "reports: ");
        if (e[i].reports == MO_INVALID_IDX) {
            fprintf(f, " <none>\n");
        } else {
-            const ReportID *r = (const ReportID *)(limex_base + e[i].reports);
+            const ReportID *r = reinterpret_cast<const ReportID *>(limex_base + e[i].reports);
            while (*r != MO_INVALID_IDX) {
                fprintf(f, " %u", *r++);
            }
@ -282,7 +282,7 @@ void dumpLimexShifts(const limex_type *limex, FILE *f) {
    fprintf(f, "Shift Masks:\n");
    for(u32 i = 0; i < limex->shiftCount; i++) {
        fprintf(f, "\t Shift %u(%hhu)\t\tMask: %s\n", i, limex->shiftAmount[i],
-                dumpMask((const u8 *)&limex->shift[i], size).c_str());
+                dumpMask(reinterpret_cast<const u8 *>(&limex->shift[i]), size).c_str());
    }
 }
 template<typename limex_type>
@ -304,20 +304,20 @@ void dumpLimexText(const limex_type *limex, FILE *f) {
    }
    fprintf(f, "\n\n");

-    dumpMask(f, "init", (const u8 *)&limex->init, size);
-    dumpMask(f, "init_dot_star", (const u8 *)&limex->initDS, size);
-    dumpMask(f, "accept", (const u8 *)&limex->accept, size);
-    dumpMask(f, "accept_at_eod", (const u8 *)&limex->acceptAtEOD, size);
-    dumpMask(f, "accel", (const u8 *)&limex->accel, size);
-    dumpMask(f, "accel_and_friends", (const u8 *)&limex->accel_and_friends,
+    dumpMask(f, "init", reinterpret_cast<const u8 *>(&limex->init), size);
+    dumpMask(f, "init_dot_star", reinterpret_cast<const u8 *>(&limex->initDS), size);
+    dumpMask(f, "accept", reinterpret_cast<const u8 *>(&limex->accept), size);
+    dumpMask(f, "accept_at_eod", reinterpret_cast<const u8 *>(&limex->acceptAtEOD), size);
+    dumpMask(f, "accel", reinterpret_cast<const u8 *>(&limex->accel), size);
+    dumpMask(f, "accel_and_friends", reinterpret_cast<const u8 *>(&limex->accel_and_friends),
             size);
-    dumpMask(f, "compress_mask", (const u8 *)&limex->compressMask, size);
-    dumpMask(f, "emask", (const u8 *)&limex->exceptionMask, size);
-    dumpMask(f, "zombie", (const u8 *)&limex->zombieMask, size);
+    dumpMask(f, "compress_mask", reinterpret_cast<const u8 *>(&limex->compressMask), size);
+    dumpMask(f, "emask", reinterpret_cast<const u8 *>(&limex->exceptionMask), size);
+    dumpMask(f, "zombie", reinterpret_cast<const u8 *>(&limex->zombieMask), size);

    // Dump top masks, if there are any.
    u32 topCount = limex->topCount;
-    const u8 *topMask = (const u8 *)limex + limex->topOffset;
+    const u8 *topMask = reinterpret_cast<const u8 *>(limex) + limex->topOffset;
    for (u32 i = 0; i < topCount; i++) {
        std::ostringstream name;
        name << "top_" << i;
@ -331,7 +331,7 @@ void dumpLimexText(const limex_type *limex, FILE *f) {
    dumpSquash(limex, f);

    dumpLimexReachMap(limex->reachMap, f);
-    dumpLimexReachMasks(size, (const u8 *)limex + sizeof(*limex) /* reach*/,
+    dumpLimexReachMasks(size, reinterpret_cast<const u8 *>(limex) + sizeof(*limex) /* reach*/,
                        limex->reachSize, f);

    dumpAccepts(limex, f);
@ -378,7 +378,7 @@ struct limex_labeller : public nfa_labeller {
    void label_state(FILE *f, u32 state) const override {
        const typename limex_traits<limex_type>::exception_type *exceptions
            = getExceptionTable(limex);
-        if (!testbit((const u8 *)&limex->exceptionMask,
+        if (!testbit(reinterpret_cast<const u8 *>(&limex->exceptionMask),
                     limex_traits<limex_type>::size, state)) {
            return;
        }
@ -404,11 +404,11 @@ static
 void dumpVertexDotInfo(const limex_type *limex, u32 state_count, FILE *f,
                       const nfa_labeller &labeller) {
    u32 size = sizeof(limex->init) * 8;
-    const u8 *reach = (const u8 *)limex + sizeof(*limex);
+    const u8 *reach = reinterpret_cast<const u8 *>(limex) + sizeof(*limex);
    vector<CharReach> perStateReach;
    setupReach(limex->reachMap, reach, size, state_count, &perStateReach);

-    const u8 *topMask = (const u8 *)limex + limex->topOffset;
+    const u8 *topMask = reinterpret_cast<const u8 *>(limex) + limex->topOffset;

    for (u32 state = 0; state < state_count; state++) {
        fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, "
@ -419,15 +419,15 @@ void dumpVertexDotInfo(const limex_type *limex, u32 state_count, FILE *f,
        // bung in another couple lines to push char class (the widest thing) up a bit
        fprintf(f, "\\n\\n\" ];\n");

-        if (testbit((const u8 *)&limex->acceptAtEOD, size, state)) {
+        if (testbit(reinterpret_cast<const u8 *>(&limex->acceptAtEOD), size, state)) {
            fprintf(f, "%u [ shape = box ];\n", state);
-        } else if (testbit((const u8 *)&limex->accept, size, state)) {
+        } else if (testbit(reinterpret_cast<const u8 *>(&limex->accept), size, state)) {
            fprintf(f, "%u [ shape = doublecircle ];\n", state);
        }
-        if (testbit((const u8 *)&limex->accel, size, state)) {
+        if (testbit(reinterpret_cast<const u8 *>(&limex->accel), size, state)) {
            fprintf(f, "%u [ color = red style = diagonals];\n", state);
        }
-        if (testbit((const u8 *)&limex->init, size, state)) {
+        if (testbit(reinterpret_cast<const u8 *>(&limex->init), size, state)) {
            fprintf(f, "START -> %u [ color = grey ];\n", state);
        }

@ -447,7 +447,7 @@ template<typename limex_type>
 static
 void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) {
    u32 size = limex_traits<limex_type>::size;
-    if (!testbit((const u8 *)&limex->exceptionMask, size, state)) {
+    if (!testbit(reinterpret_cast<const u8 *>(&limex->exceptionMask), size, state)) {
        return; /* not exceptional */
    }

@ -461,10 +461,10 @@ void dumpExDotInfo(const limex_type *limex, u32 state, FILE *f) {
    u32 state_count = limex_to_nfa(limex)->nPositions;

    for (u32 j = 0; j < state_count; j++) {
-        if (testbit((const u8 *)&e->successors, size, j)) {
+        if (testbit(reinterpret_cast<const u8 *>(&e->successors), size, j)) {
            fprintf(f, "%u -> %u [color = blue];\n", state, j);
        }
-        if (!testbit((const u8 *)&e->squash, size, j)) {
+        if (!testbit(reinterpret_cast<const u8 *>(&e->squash), size, j)) {
            fprintf(f, "%u -> %u [color = grey style = dashed];\n", state, j);
        }
    }
@ -480,7 +480,7 @@ static
 void dumpLimDotInfo(const limex_type *limex, u32 state, FILE *f) {
    for (u32 j = 0; j < limex->shiftCount; j++) {
        const u32 shift_amount = limex->shiftAmount[j];
-        if (testbit((const u8 *)&limex->shift[j],
+        if (testbit(reinterpret_cast<const u8 *>(&limex->shift[j]),
                    limex_traits<limex_type>::size, state)) {
            fprintf(f, "%u -> %u;\n", state, state + shift_amount);
        }
@ -502,7 +502,7 @@ void dumpLimexDot(const NFA *nfa, const limex_type *limex, FILE *f) {

 #define LIMEX_DUMP_FN(size)                                                    \
    void nfaExecLimEx##size##_dump(const NFA *nfa, const string &base) {       \
-        auto limex = (const LimExNFA##size *)getImplNfa(nfa);                  \
+        auto limex = reinterpret_cast<const LimExNFA##size *>(getImplNfa(nfa));                  \
        dumpLimexText(limex, StdioFile(base + ".txt", "w"));                   \
        dumpLimexDot(nfa, limex, StdioFile(base + ".dot", "w"));               \
    }
--- a/src/nfa/limex_exceptional.h
+++ b/src/nfa/limex_exceptional.h
@ -302,8 +302,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
    }
 #else
    // A copy of the estate as an array of GPR-sized chunks.
-    CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
-    CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
+    CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];       // cppcheck-suppress duplicateExpression
+    CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; // cppcheck-suppress duplicateExpression
 #ifdef ESTATE_ON_STACK
    memcpy(chunks, &estate, sizeof(STATE_T));
 #else
@ -311,7 +311,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
 #endif
    memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));

-    u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
+    u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];       // cppcheck-suppress duplicateExpression
    base_index[0] = 0;
    for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
        base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
@ -322,6 +322,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
 #ifdef ARCH_64_BIT
        t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
 #endif
+        // cppcheck-suppress unsignedLessThanZero
        assert(t < ARRAY_LENGTH(chunks));
        CHUNK_T word = chunks[t];
        assert(word != 0);
--- a/src/nfa/limex_internal.h
+++ b/src/nfa/limex_internal.h
@ -163,12 +163,12 @@ struct LimExNFA##size {                                                     \
    m512 exceptionAndMask; /**< exception and mask */                       \
 };

-CREATE_NFA_LIMEX(32)
-CREATE_NFA_LIMEX(64)
-CREATE_NFA_LIMEX(128)
-CREATE_NFA_LIMEX(256)
-CREATE_NFA_LIMEX(384)
-CREATE_NFA_LIMEX(512)
+CREATE_NFA_LIMEX(32)        //NOLINT (clang-analyzer-optin.performance.Padding)
+CREATE_NFA_LIMEX(64)        //NOLINT (clang-analyzer-optin.performance.Padding)
+CREATE_NFA_LIMEX(128)       //NOLINT (clang-analyzer-optin.performance.Padding)
+CREATE_NFA_LIMEX(256)       //NOLINT (clang-analyzer-optin.performance.Padding)
+CREATE_NFA_LIMEX(384)       //NOLINT (clang-analyzer-optin.performance.Padding)
+CREATE_NFA_LIMEX(512)       //NOLINT (clang-analyzer-optin.performance.Padding)

 /** \brief Structure describing a bounded repeat within the LimEx NFA.
 *
--- a/src/nfa/limex_runtime_impl.h
+++ b/src/nfa/limex_runtime_impl.h
@ -927,7 +927,7 @@ char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state,
                      context);
 }

-char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) {
+char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, const struct mq *q) {
    const IMPL_NFA_T *limex = getImplNfa(n);
    REPORTCURRENT_FN(limex, q);
    return 1;
@ -984,9 +984,9 @@ char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa,
    assert(q->state && q->streamState);

    const IMPL_NFA_T *limex = getImplNfa(nfa);
-    union RepeatControl *repeat_ctrl =
+    const union RepeatControl *repeat_ctrl =
        getRepeatControlBase(q->state, sizeof(STATE_T));
-    char *repeat_state = q->streamState + limex->stateSize;
+    const char *repeat_state = q->streamState + limex->stateSize;
    STATE_T state = *(STATE_T *)q->state;
    u64a offset = q->offset + q_last_loc(q) + 1;

@ -999,9 +999,9 @@ char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) {
    assert(q->state && q->streamState);

    const IMPL_NFA_T *limex = getImplNfa(nfa);
-    union RepeatControl *repeat_ctrl =
+    const union RepeatControl *repeat_ctrl =
        getRepeatControlBase(q->state, sizeof(STATE_T));
-    char *repeat_state = q->streamState + limex->stateSize;
+    const char *repeat_state = q->streamState + limex->stateSize;
    STATE_T state = *(STATE_T *)q->state;
    u64a offset = q->offset + q_last_loc(q) + 1;

@ -1020,9 +1020,9 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)(

    if (limex->repeatCount) {
        u64a offset = q->offset + loc + 1;
-        union RepeatControl *repeat_ctrl =
+        const union RepeatControl *repeat_ctrl =
            getRepeatControlBase(q->state, sizeof(STATE_T));
-        char *repeat_state = q->streamState + limex->stateSize;
+        const char *repeat_state = q->streamState + limex->stateSize;
        SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &state);
    }

--- a/src/nfa/mcclellan_common_impl.h
+++ b/src/nfa/mcclellan_common_impl.h
@ -134,7 +134,6 @@ u16 doWide16(const char *wide_entry, const u8 **c_inout, const u8 *end,
        len_c -= 16;
    }

-    pos = 0;
    // at least one in (0, 16).
    u32 loadLength_w = MIN(len_w, 16);
    u32 loadLength_c = MIN(len_c, 16);
--- a/src/nfa/mcclellan_internal.h
+++ b/src/nfa/mcclellan_internal.h
@ -106,9 +106,10 @@ static really_inline
 const char *findShermanState(UNUSED const struct mcclellan *m,
                             const char *sherman_base_offset, u32 sherman_base,
                             u32 s) {
-    const char *rv
-        = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
+    const char *rv = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base);
+    // cppcheck-suppress cstyleCast
    assert(rv < (const char *)m + m->length - sizeof(struct NFA));
+    // cppcheck-suppress cstyleCast
    UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET);
    assert(type == SHERMAN_STATE);
    return rv;
@ -123,13 +124,15 @@ char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base,
 static really_inline
 const char *findWideEntry8(UNUSED const struct mcclellan *m,
                           const char *wide_base, u32 wide_limit, u32 s) {
+    // cppcheck-suppress cstyleCast
    UNUSED u8 type = *(const u8 *)wide_base;
    assert(type == WIDE_STATE);
-    const u32 entry_offset
-        = *(const u32 *)(wide_base
+    // cppcheck-suppress cstyleCast
+    const u32 entry_offset = *(const u32 *)(wide_base
        + WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32)));

    const char *rv = wide_base + entry_offset;
+    // cppcheck-suppress cstyleCast
    assert(rv < (const char *)m + m->length - sizeof(struct NFA));
    return rv;
 }
@ -137,21 +140,23 @@ const char *findWideEntry8(UNUSED const struct mcclellan *m,
 static really_inline
 const char *findWideEntry16(UNUSED const struct mcclellan *m,
                            const char *wide_base, u32 wide_limit, u32 s) {
+    // cppcheck-suppress cstyleCast
    UNUSED u8 type = *(const u8 *)wide_base;
    assert(type == WIDE_STATE);
-    const u32 entry_offset
-        = *(const u32 *)(wide_base
+    // cppcheck-suppress cstyleCast
+    const u32 entry_offset = *(const u32 *)(wide_base
        + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));

    const char *rv = wide_base + entry_offset;
+    // cppcheck-suppress cstyleCast
    assert(rv < (const char *)m + m->length - sizeof(struct NFA));
    return rv;
 }

 static really_inline
 char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) {
-    u32 entry_offset
-        = *(const u32 *)(wide_base
+    // cppcheck-suppress cstyleCast
+    u32 entry_offset = *(const u32 *)(wide_base
        + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32)));

    return wide_base + entry_offset;
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Subproject commit 416091ebdb9e901b29d026633e73167d6353a0b0`